aboutsummaryrefslogtreecommitdiffstats
path: root/webstats/logparse.lisp
diff options
context:
space:
mode:
authorOscar Najera <hi@oscarnajera.com>2025-04-27 22:35:56 +0200
committerOscar Najera <hi@oscarnajera.com>2025-04-27 22:36:09 +0200
commit78af58cfd32e3f9e7cd5c36716fd2d971e01102b (patch)
tree491d77417f0fdb2404863d1e7007d72a1db815ed /webstats/logparse.lisp
parentb46f51a0b3e2e1eb4262c393ac9468802de86c67 (diff)
downloadscratch-78af58cfd32e3f9e7cd5c36716fd2d971e01102b.tar.gz
scratch-78af58cfd32e3f9e7cd5c36716fd2d971e01102b.tar.bz2
scratch-78af58cfd32e3f9e7cd5c36716fd2d971e01102b.zip
parsing apache logs
Diffstat (limited to 'webstats/logparse.lisp')
-rw-r--r--webstats/logparse.lisp121
1 files changed, 121 insertions, 0 deletions
diff --git a/webstats/logparse.lisp b/webstats/logparse.lisp
new file mode 100644
index 0000000..71864d8
--- /dev/null
+++ b/webstats/logparse.lisp
@@ -0,0 +1,121 @@
+(ql:quickload 'cl-ppcre)
+(ql:quickload 'sqlite)
+(ql:quickload 'cl-yesql)
+(ql:quickload 'cl-yesql/sqlite)
+(ql:quickload "gzip-stream")
+
+(defparameter *log-line* "(.+?) (.+?) (.+?) \\[(.+?)\\] \"(.+?) (.+?) (HTTP/.*)?\" (.+?) (.+?) \"(.+?)\" \"(.+?)\"")
+
+(defparameter +short-month-names+
+ (alexandria:plist-hash-table
+ '("Jan" 1 "Feb" 2 "Mar" 3 "Apr" 4 "May" 5 "Jun" 6 "Jul" 7 "Aug" 8 "Sep" 9 "Oct" 10 "Nov" 11 "Dec" 12)
+ :test #'equal))
+
+(defvar *log-files*
+ (sort
+ (remove-if-not
+ (lambda (str) (ppcre:scan "^cgit_access" str))
+ (uiop:directory-files #P"/scratch/titan/apache2/")
+ :key #'pathname-name)
+ #'<
+ :key (lambda (filename)
+ (sb-posix:stat-mtime
+ (sb-posix:stat filename)))))
+
+(defun time-to-posix (timestr)
+ (ppcre:register-groups-bind ((#'parse-integer date) month (#'parse-integer year hour min sec))
+ ("(\\d{2})/(\\w{3})/(\\d{4}):(\\d{2}):(\\d{2}):(\\d{2})" timestr)
+ (-
+ (encode-universal-time sec min hour date (gethash month +short-month-names+) year)
+ #.(encode-universal-time 0 0 0 1 1 1970 0))))
+
+(= (time-to-posix "02/Mar/2025:00:00:16 +0000") 1740870016)
+
+(ppcre:register-groups-bind
+ (ip remote-log userid (#'time-to-posix date) method path version (#'parse-integer status length) referrer agent)
+ (*log-line*
+ "20.171.207.185 - - [02/Mar/2025:00:00:16 +0000] \"GET /pub/hi/scratch/plain/AoC2023/day02/input?h=simpler&id=e2f5b6c2d5bb67013ba1b612252781e4cd9b6fe1 HTTP/1.1\" 200 7017 \"-\" \"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)\"")
+ (list ip remote-log userid date method path version status length referrer agent))
+
+
+(defvar *sqlite*
+ (sqlite:connect "/scratch/titan/apache2/cgit-logs.sqlite"))
+(sqlite:disconnect *sqlite*)
+
+(with-open-file (in #P"/scratch/titan/cgit_access.log")
+ (read-line in)
+ (ppcre:register-groups-bind
+ (ip remote-log userid date method path version (#'parse-integer status length) referrer agent)
+ (*log-line* (read-line in))
+
+ can this be simplified?
+ (let ((ipid
+ (or (sqlite:execute-single *sqlite* "select id from ip where addr = ?" ip)
+ (sqlite:execute-single *sqlite* "insert into ip (addr) values (?) returning id" ip)))
+ (agentid
+ (or (sqlite:execute-single *sqlite* "select id from agent where user_agent = ?" agent)
+ (sqlite:execute-single *sqlite* "insert into agent (user_agent) values (?) returning id" agent))))
+
+
+ ;; (sqlite:execute-single *sqlite* "select id from agent where user_agent = ?" agent)
+ (list
+ (list ip ipid remote-log userid date (time-to-posix date) method path version status length referrer agent agentid)))))
+
+;; PRAGMA synchronous = 0;
+;; PRAGMA journal_mode=WAL;
+;; PRAGMA cache_size = 1000000;
+;; PRAGMA locking_mode = EXCLUSIVE;
+;; PRAGMA temp_store = MEMORY;
+
+(yesql:import log-queries
+ :from "/home/titan/dev/scratch/webstats/logs.sql"
+ :as :cl-yesql/sqlite
+ :binding :all-functions)
+
+(create-log-table *sqlite*)
+(create-agent-table *sqlite*)
+(create-ip-table *sqlite*)
+
+(defun insert-line (db line)
+ (ppcre:register-groups-bind
+ (ip remote-log userid (#'time-to-posix date) method path version (#'parse-integer status length) referrer agent)
+ (*log-line* line)
+ (insert-log db ip remote-log userid date method path version status length referrer agent)))
+
+(defun insert-from (stream)
+ (loop for line = (read-line stream nil nil)
+ while line
+ do (insert-line *sqlite* line)))
+
+(time
+ (sqlite:with-transaction *sqlite*
+ (with-open-file (in #P"/scratch/titan/apache2/cgit_access.log")
+ (insert-from in))))
+
+
+(loop for f in *log-files*
+ when (ppcre:scan "gz$" (pathname-type f))
+ do
+ (print f)
+ (time
+ (sqlite:with-transaction *sqlite*
+ (gzip-stream:with-open-gzip-file (in f)
+ (insert-from in)))))
+
+
+(sqlite:execute-non-query *sqlite* "insert into agent (user_agent) select distinct user_agent from logs")
+(sqlite:execute-non-query *sqlite* "alter table logs add column agentid integer")
+(sqlite:execute-non-query *sqlite* "update logs set agentid = (select id from agent where logs.user_agent = agent.user_agent)")
+(sqlite:execute-non-query *sqlite* "alter table logs drop column user_agent")
+
+(sqlite:execute-non-query *sqlite* "insert into ip (addr) select distinct ip from logs")
+(sqlite:execute-non-query *sqlite* "alter table logs add column ipid integer")
+(sqlite:execute-non-query *sqlite* "update logs set ipid = (select id from ip where logs.ip = ip.addr)")
+(sqlite:execute-non-query *sqlite* "alter table logs drop column ip")
+
+(time
+ (sqlite:execute-to-list *sqlite* "select * from ip where addr = '18.227.13.136'"))
+
+
+
+(sqlite:execute-single *sqlite* "select count(*) from ip")