aboutsummaryrefslogtreecommitdiffstats
path: root/webstats/logparse.lisp
blob: 71864d85126e633ae11cb8a1bffa9803674f5241 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
(ql:quickload 'cl-ppcre)
(ql:quickload 'sqlite)
(ql:quickload 'cl-yesql)
(ql:quickload 'cl-yesql/sqlite)
(ql:quickload "gzip-stream")

(defparameter *log-line* "(.+?) (.+?) (.+?) \\[(.+?)\\] \"(.+?) (.+?) (HTTP/.*)?\" (.+?) (.+?) \"(.+?)\" \"(.+?)\"")

(defparameter +short-month-names+
  (alexandria:plist-hash-table
   '("Jan" 1 "Feb" 2 "Mar" 3 "Apr" 4 "May" 5 "Jun" 6 "Jul" 7 "Aug" 8 "Sep" 9 "Oct" 10 "Nov" 11 "Dec" 12)
   :test #'equal))

(defvar *log-files*
  (sort
   (remove-if-not
    (lambda (str) (ppcre:scan "^cgit_access" str))
    (uiop:directory-files #P"/scratch/titan/apache2/")
    :key #'pathname-name)
   #'<
   :key (lambda (filename)
          (sb-posix:stat-mtime
           (sb-posix:stat filename)))))

(defun time-to-posix (timestr)
  (ppcre:register-groups-bind ((#'parse-integer date) month (#'parse-integer year hour min sec))
      ("(\\d{2})/(\\w{3})/(\\d{4}):(\\d{2}):(\\d{2}):(\\d{2})" timestr)
    (-
     (encode-universal-time sec min hour date (gethash month +short-month-names+) year)
     #.(encode-universal-time 0 0 0 1 1 1970 0))))

(= (time-to-posix "02/Mar/2025:00:00:16 +0000") 1740870016)

(ppcre:register-groups-bind
    (ip remote-log userid (#'time-to-posix date) method path version (#'parse-integer status length) referrer agent)
    (*log-line*
     "20.171.207.185 - - [02/Mar/2025:00:00:16 +0000] \"GET /pub/hi/scratch/plain/AoC2023/day02/input?h=simpler&id=e2f5b6c2d5bb67013ba1b612252781e4cd9b6fe1 HTTP/1.1\" 200 7017 \"-\" \"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)\"")
  (list ip remote-log userid date method path version status length referrer agent))


(defvar *sqlite*
  (sqlite:connect "/scratch/titan/apache2/cgit-logs.sqlite"))
(sqlite:disconnect *sqlite*)

(with-open-file (in #P"/scratch/titan/cgit_access.log")
  (read-line in)
  (ppcre:register-groups-bind
      (ip remote-log userid  date method path version (#'parse-integer status length) referrer agent)
      (*log-line* (read-line in))

    can this be simplified?
    (let ((ipid
            (or (sqlite:execute-single *sqlite* "select id from ip where addr = ?" ip)
                (sqlite:execute-single *sqlite* "insert into ip (addr) values (?) returning id" ip)))
          (agentid
            (or (sqlite:execute-single *sqlite* "select id from agent where user_agent = ?" agent)
                (sqlite:execute-single *sqlite* "insert into agent (user_agent) values (?) returning id" agent))))


      ;; (sqlite:execute-single *sqlite* "select id from agent where user_agent = ?" agent)
      (list
       (list ip ipid remote-log userid date (time-to-posix date) method path version status length referrer agent agentid)))))

;; PRAGMA synchronous = 0;
;; PRAGMA journal_mode=WAL;
;; PRAGMA cache_size = 1000000;
;; PRAGMA locking_mode = EXCLUSIVE;
;; PRAGMA temp_store = MEMORY;

(yesql:import log-queries
  :from "/home/titan/dev/scratch/webstats/logs.sql"
  :as :cl-yesql/sqlite
  :binding :all-functions)

(create-log-table *sqlite*)
(create-agent-table *sqlite*)
(create-ip-table *sqlite*)

(defun insert-line (db line)
  (ppcre:register-groups-bind
      (ip remote-log userid (#'time-to-posix date) method path version (#'parse-integer status length) referrer agent)
      (*log-line* line)
    (insert-log db ip remote-log userid date method path version status length referrer agent)))

(defun insert-from (stream)
  (loop for line = (read-line stream nil nil)
        while line
        do (insert-line *sqlite* line)))

(time
 (sqlite:with-transaction *sqlite*
   (with-open-file (in #P"/scratch/titan/apache2/cgit_access.log")
     (insert-from in))))


(loop for f in *log-files*
      when (ppcre:scan "gz$" (pathname-type f))
        do
           (print f)
           (time
            (sqlite:with-transaction *sqlite*
              (gzip-stream:with-open-gzip-file (in f)
                (insert-from in)))))


(sqlite:execute-non-query *sqlite* "insert into agent (user_agent) select distinct user_agent from logs")
(sqlite:execute-non-query *sqlite* "alter table logs add column agentid integer")
(sqlite:execute-non-query *sqlite* "update logs set agentid = (select id from agent where logs.user_agent = agent.user_agent)")
(sqlite:execute-non-query *sqlite* "alter table logs drop column user_agent")

(sqlite:execute-non-query *sqlite* "insert into ip (addr) select distinct ip from logs")
(sqlite:execute-non-query *sqlite* "alter table logs add column ipid integer")
(sqlite:execute-non-query *sqlite* "update logs set ipid = (select id from ip where logs.ip = ip.addr)")
(sqlite:execute-non-query *sqlite* "alter table logs drop column ip")

(time
 (sqlite:execute-to-list *sqlite* "select * from ip where addr = '18.227.13.136'"))



(sqlite:execute-single *sqlite* "select count(*) from ip")