From 20fb1a21587dc72d5e314119d6986fd3f2a45f13 Mon Sep 17 00:00:00 2001
From: Oscar Najera
Date: Mon, 31 Jul 2023 21:51:44 +0200
Subject: scratch sem grep

---
 scratch/semgrep/parse.py | 116 +++++++++++++++++++++++++++++++++++++++++++++++
 scratch/semgrep/test.org |  20 ++++++++
 2 files changed, 136 insertions(+)
 create mode 100644 scratch/semgrep/parse.py
 create mode 100644 scratch/semgrep/test.org

diff --git a/scratch/semgrep/parse.py b/scratch/semgrep/parse.py
new file mode 100644
index 0000000..3846e3e
--- /dev/null
+++ b/scratch/semgrep/parse.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""Scratch: load org-roam nodes into a DataFrame and try sentence embeddings."""
+
+import re
+
+import orgparse
+import pandas as pd
+
+# Matches a "#+title:" keyword on a line of its own. MULTILINE lets it be found
+# anywhere in a multi-line body; "[ \t]*" (rather than "\s*") keeps an empty
+# title from swallowing the lines that follow it.
+TITLE_PATTERN = re.compile(r"^#\+title:[ \t]*(.*)$", re.IGNORECASE | re.MULTILINE)
+
+
+def org_roam_nodes_to_dataframe(org_file):
+    """Build a DataFrame with one row per node of ``org_file`` that has an ID.
+
+    Args:
+        org_file: Path of the org file; handed to ``orgparse.load`` and stored
+            unchanged in the ``file_name`` column.
+
+    Returns:
+        A ``pandas.DataFrame`` with the columns ``file_name``, ``node_id``,
+        ``node_title``, ``node_hierarchy``, ``node_text``, ``node_text_nested``
+        and ``node_text_nested_exclusive``.
+    """
+    org_data = orgparse.load(org_file)
+
+    def extract_title(node):
+        """Return the heading, the ``#+title:`` value or the first body line."""
+        if node.heading:
+            return node.heading
+        match = TITLE_PATTERN.search(node.body)
+        if match:
+            return match.group(1).strip()
+        # No title keyword found: fall back to the first line of the body.
+        first_line = node.body.split("\n")[0]
+        return re.sub(r"#\+title:", "", first_line, flags=re.IGNORECASE).strip()
+
+    def is_roam_node(node):
+        """Tell whether ``node`` carries an ``ID`` or ``SEARCH`` property."""
+        return bool(node.properties.get("ID") or node.properties.get("SEARCH"))
+
+    def nested_body(node, include):
+        """Join ``node.body`` with the headings and bodies of accepted children."""
+        parts = [node.body]
+        for child in node.children:
+            if include(child):
+                parts.append(f"{'*' * child.level} {child.heading}")
+                parts.append(nested_body(child, include))
+        return "\n".join(parts).strip()
+
+    def extract_node_nested_body(node):
+        """Return the text of ``node`` together with all of its descendants."""
+        return nested_body(node, lambda child: True)
+
+    def extract_node_nested_body_exclusive(node):
+        """Return the nested text, skipping subtrees that have their own ID/SEARCH."""
+        return nested_body(node, lambda child: not is_roam_node(child))
+
+    def build_node_hierarchy(node):
+        """Return the titles from the topmost ancestor down to ``node``."""
+        titles = []
+        current = node
+        while current is not None:
+            titles.append(extract_title(current))
+            current = current.parent
+        return " > ".join(reversed(titles)).strip()
+
+    def node_to_dict(node, file_name):
+        """Flatten ``node`` into the mapping that becomes one DataFrame row."""
+        return {
+            "file_name": file_name,
+            "node_id": node.properties.get("ID"),
+            "node_title": extract_title(node),
+            "node_hierarchy": build_node_hierarchy(node),
+            "node_text": node.body,
+            "node_text_nested": extract_node_nested_body(node),
+            "node_text_nested_exclusive": extract_node_nested_body_exclusive(node),
+        }
+
+    nodes = [
+        node_to_dict(node, org_file)
+        for node in org_data[0][:]
+        if node.properties.get("ID")
+    ]
+
+    return pd.DataFrame(nodes)
+
+
+def main():
+    """Embed a query and sample passages, then print their dot-score similarity."""
+    # Imported here so the DataFrame helper does not need the embedding stack.
+    from sentence_transformers import SentenceTransformer, util
+
+    # The model must be created before the first encode() call.
+    model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
+    model.encode(
+        "What you need is the gist. Your mind needs the repetition to absorb the information. The true learning is on *doing* not in the input. If you can't use the material, you don't know. There is just a limited amount you can learn by listening. The rest you must do yourself. Courses give you ideas, you must figure out what to do with them."
+    )
+
+    query_embedding = model.encode("How big is London")
+    passage_embedding = model.encode(
+        [
+            "London has 9,787,426 inhabitants at the 2011 census",
+            "London is known for its finacial district",
+            "London is full of criminals",
+            "Cairo is small",
+        ]
+    )
+
+    print("Similarity:", util.dot_score(query_embedding, passage_embedding))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scratch/semgrep/test.org b/scratch/semgrep/test.org
new file mode 100644
index 0000000..aac16d0
--- /dev/null
+++ b/scratch/semgrep/test.org
@@ -0,0 +1,20 @@
+#+title: Test
+
+It has started all wars.
+
+No more data. It is a case of food suply. No more data. It is a case of food
+supply and nothing else for the.
+
+How an that
+
+#+begin_src emacs-lisp :results value raw
+(json-encode
+ (org-element-map
+  (org-element-parse-buffer)
+  'paragraph
+  (lambda (hl)
+    (car
+     (org-element-contents hl)))))
+#+end_src
+
+#+RESULTS:
-- 
cgit v1.2.3