2 files changed, 146 insertions, 58 deletions
diff --git a/webstats/logparse.lisp b/webstats/logparse.lisp
index 71864d8..b7e0577 100644
--- a/webstats/logparse.lisp
+++ b/webstats/logparse.lisp
@@ -26,7 +26,7 @@
   (ppcre:register-groups-bind ((#'parse-integer date) month (#'parse-integer year hour min sec))
       ("(\\d{2})/(\\w{3})/(\\d{4}):(\\d{2}):(\\d{2}):(\\d{2})" timestr)
     (-
-     (encode-universal-time sec min hour date (gethash month +short-month-names+) year)
+     (encode-universal-time sec min hour date (gethash month +short-month-names+) year 0)
      #.(encode-universal-time 0 0 0 1 1 1970 0))))
 
 (= (time-to-posix "02/Mar/2025:00:00:16 +0000") 1740870016)
@@ -37,7 +37,6 @@
      "20.171.207.185 - - [02/Mar/2025:00:00:16 +0000] \"GET /pub/hi/scratch/plain/AoC2023/day02/input?h=simpler&id=e2f5b6c2d5bb67013ba1b612252781e4cd9b6fe1 HTTP/1.1\" 200 7017 \"-\" \"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)\"")
   (list ip remote-log userid date method path version status length referrer agent))
 
-
 (defvar *sqlite*
   (sqlite:connect "/scratch/titan/apache2/cgit-logs.sqlite"))
 (sqlite:disconnect *sqlite*)
diff --git a/webstats/workforrobots.org b/webstats/workforrobots.org
index 2c8c542..76e7ddd 100644
--- a/webstats/workforrobots.org
+++ b/webstats/workforrobots.org
@@ -6,21 +6,39 @@
 #+HTML_HEAD_EXTRA: <script src="static/uPlot.iife.min.js"></script>
 #+HTML_HEAD_EXTRA: <script src="plots.js"></script>
 
-I self-host my website to keep sovereignty and independence from large Internet
-corporations. I don't have at the moment any analytics tool on this site.
-However, some of my git repositories are also self-hosted, and there I have
-access to the =Apache= logs. With all the =AI-hype= and coding assistants, I
-wanted to have a look at how of my =software= are those AI companies taking.
-
-Analyzing my log files in the period from =2024-12-31 23:00:35= till =2025-04-20
-00:38:24= I'm surprised to see how dumb those AI companies are in the use of
-their web crawlers.
-
-Table [[top-users]] shows the top /users/ of my git repository. The leading AI
-companies =OpenAI= and =Anthropic= with their respective bots =GPTBot= and
-=ClaudeBot= simply dominate. I found it unbelievable that they could extract
-about =≈7GiB= of data. That is a lot of Bandwidth out of my server for a few git
-repositories and in a lightweight web interphase.
+
+I self-host some of my git repositories to keep sovereignty and independence
+from large Internet corporations. The public facing repositories are for
+everybody, and today that means for robots. With the =AI-hype= on coding
+assistants, I wanted to have a look at what those AI companies are taking. It is
+worse than everything, it is idiotically everything. They can't recognize that
+they are parsing git repositories and use the appropriate way of downloading
+them.
+
+#+begin_src sqlite :exports none
+SELECT -- time span covered by the logs table; date holds seconds since the Unix epoch
+    min(date), -- earliest request, raw stored value
+    min(datetime (date, 'unixepoch')), -- earliest request rendered as UTC text
+    min(datetime (date, 'unixepoch', 'localtime')), -- earliest request in the local time zone of the machine running the query
+    max(date), -- latest request, raw stored value
+    max(datetime (date, 'unixepoch')), -- latest request rendered as UTC text
+    max(datetime (date, 'unixepoch', 'localtime')) -- latest request in local time
+FROM
+    logs
+#+end_src
+
+#+RESULTS[8f773b86167f2d36db335568f344063f13838a11]:
+|  min(date) | min(datetime (date, 'unixepoch')) | min(datetime (date, 'unixepoch', 'localtime')) |  max(date) | max(datetime (date, 'unixepoch')) | max(datetime (date, 'unixepoch', 'localtime')) |
+|------------+-----------------------------------+------------------------------------------------+------------+-----------------------------------+------------------------------------------------|
+| 1735686035 | 2024-12-31 23:00:35               | 2025-01-01 00:00:35                            | 1745109504 | 2025-04-20 00:38:24               | 2025-04-20 02:38:24                            |
+* Who is visiting
+I analyzed the =Apache= log files of my =cgit= service in the period from
+=2025-01-01= till =2025-04-20=. Table [[top-users]] shows the top /users/ of my
+public-facing git repositories. The leading AI companies =OpenAI= and =Anthropic=
+with their respective bots =GPTBot= and =ClaudeBot= simply dominate. I found it
+unbelievable that they could extract about =≈7GiB= of data each. That is a lot
+of bandwidth out of my server for a few git repositories behind a lightweight
+web interface.
 
 #+begin_src sqlite :exports results
 --SELECT
@@ -67,6 +85,12 @@ LIMIT 10
 |     2500 |   53.7 | Mozilla/5.0 (compatible; SeekportBot; +https://bot.seekport.com)                                                                                                      |
 |     3578 |   30.9 | Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.7049.52 Mobile Safari/537.36 (compatible; GoogleOther)  |
 
+That is the total. What does it look like as a function of time? Figure
+[[fig:agent-traffic]] shows the load on CGit frontend service by each visiting
+agent. Hover over the plot to read the exact value for each agent at a given
+time on the legend. You can highlight a specific curve by hovering over it or
+its legend.
+
 #+begin_src sqlite :results value file :file top_agent_traffic.csv :exports none
 SELECT
     date / 14400 * 14400 AS time, -- 4h bin
@@ -94,58 +118,54 @@ GROUP BY
 #+RESULTS[87f78b2de4d43785f81e682925c6b6542d794883]:
 [[file:top_agent_traffic.csv]]
 
-This is reviewed in figure [[fig:agent-traffic]]
-
 #+attr_html: :id agent-traffic
-#+CAPTION: Example of the process
+#+CAPTION: Load on the CGit frontend service by each visiting agent. The first black dashed line shows the total requests at the server and uses the right axis scale. All other solid-filled lines use the left axis and represent the bandwidth usage.
 #+NAME: fig:agent-traffic
 [[./jsfill.png]]
 
-#+caption: Caption shared by both figures
-#+begin_quote
-teusta sat
-#+end_quote
+You can see how aggressively the =ClaudeBot= scrapes pages, consuming a lot of
+bandwidth in a /short/ time. =OpenAI-GPTBot= shows its rate limiting, and
+performs its scraping over a /longer/ period of time. However, as seen in table
+[[top-users]], it performs more than twice the number of requests and consumes
+=30%= more bandwidth.
 
-following theorem [[th:theproc]]
+The rest of the visitors are bots too. =Barkrowler= is a regular visitor
+gathering metrics for online marketing. =AhrefsBot= is of the same type, yet
+started crawling in March. =Macintosh= is certainly a bot hiding itself as a
+browser and constantly probing. =Scrapy=, also unknown, came at the start of the
+year and never came back.
 
-#+CAPTION: Example of the process
-#+NAME: th:theproc
-#+attr_html: :style display:flex; :id fun
-#+begin_theorem
-If an integer $n$ is greater than 2, then the equation $a^n + b^n = c^n$
-has no solutions in non-zero integers $a$, $b$, and $c$.
-#+end_theorem
+=PetalBot= is for a search engine with AI recommendation by =Huawei=, it lingers
+and slowly scrapes everything. =Seekport= is a search engine, it came all of a
+sudden, took as much as it found useful, =<1%= of what the big AI bots take, and
+swiftly left again.
 
-look fig [[figblo]]
+=Bytespider= is almost background noise, but it is also to train an LLM, this
+time for ByteDance, the Chinese owner of TikTok.
 
-#+NAME: figblo
-#+attr_html: :class tusi figblo
-#+CAPTION: fun block
-#+begin_figure
-satore
-#+end_figure
+The last one =Google= doesn't even seem to be the bot for indexing its search
+engine, but rather one to test its chrome browser and how it renders pages.
 
+=Rest= is all the remaining /robots/ or /users/. They have consumed around
+=≈400MiB=, placing them in aggregate in a behavior like =Macintosh, Scrapy,
+PetalBot & AhrefsBot=.
 
-#+begin_src sqlite
-SELECT
-    round(total (length) / 1024, 2) "tx KiB",
-    count(*) hits,
-    agentid,
-    user_agent
-FROM
-    logs
-    JOIN agent ON agentid = id
-    --where agentid in (
-    --select id from agent
-WHERE
-    user_agent LIKE '%google%'
-    --)
-GROUP BY
-    id
-ORDER BY
-    1 DESC
-#+end_src
+* How should they visit?
+
+This is a collection of =git= repositories. Sure, you can browse some of my code,
+some files, that is it. But when you want *everything*, the correct use of this
+service is through the =git= client, downloading my publicly available software.
 
+That makes the data a lot more useful, even for those AI companies, as the data
+cleanup would be easier. They themselves should use their AI to recognize what
+kind of page they are visiting and act accordingly instead of stupidly scraping
+everything.
+
+How have the good citizens behaved? That is in table [[git-users]]. =Software
+Heritage= keeps mirrors of git repositories. It thus watches for updates and then
+downloads them. Other people besides them downloaded too, but in total, over
+the same period, they only downloaded =≈21MiB=. That is =0.3%= compared to
+=ClaudeBot=.
 
 #+begin_src sqlite
 SELECT
@@ -164,6 +184,8 @@ GROUP BY
 order by total(length) desc
 #+end_src
 
+#+name: git-users
+#+caption: Git users
 #+RESULTS[333fcbc738819c497f14b4445a3b45f391f0db7e]:
 | User Agent                                                                       | hits |  tx KiB |
 |----------------------------------------------------------------------------------+------+---------|
@@ -178,6 +200,73 @@ order by total(length) desc
 |----------------------------------------------------------------------------------+------+---------|
 | Total                                                                            | 2695 | 21005.6 |
 
+* What are they looking at?
+The web front end of git repositories of course, but is there a pattern?
+
+#+begin_src sqlite
+#+end_src
+
+#+RESULTS[5c52f5ea84c4e903cbbbf8dd8f73139532de16dc]:
+
+
+#+begin_src sqlite
+select count(*), total(length), status -- request count and summed length for each HTTP status code
+from logs
+where agentid = 4602 -- restrict to one agent; NOTE(review): which user agent id 4602 maps to is not visible here
+group by status
+#+end_src
+
+#+RESULTS[91700bd4c58675dde008a24f8e07cfcea3966f58]:
+| count(*) | total(length) | status |
+|----------+---------------+--------|
+|   207240 |   497513096.0 |    200 |
+|      348 |     1195624.0 |    404 |
+|      183 |      186683.0 |    500 |
+#+begin_src sqlite
+select count(distinct ipid) -- distinct ipid values (presumably client IP addresses - confirm against the schema)
+, count(distinct agentid) -- distinct user agents among the remaining traffic
+from logs
+where
+agentid not in (143, 1, 19, 6, 4602, 3, 2, 4, 10306, 9) -- presumably the ids of the ten agents listed in table top-users, leaving the "Rest" bucket - TODO confirm
+#+end_src
+
+#+RESULTS[efe7c2c9c481b475baccee8d9ed48d814003b23e]:
+| count(distinct ipid) | count(distinct agentid) |
+|----------------------+-------------------------|
+|                35144 |                   20648 |
+
+#+begin_src sqlite
+select count(distinct ipid), -- how many distinct ipid values this agent used
+max(date), -- its most recent request, raw stored value (seconds since the Unix epoch)
+ max(datetime (date, 'unixepoch')) -- the same instant rendered as UTC text
+from logs
+where agentid = 4602 -- NOTE(review): which user agent id 4602 maps to is not visible here
+#+end_src
+
+#+RESULTS[49b56e0faa552ea7df0d2515250f8f16365a7a40]:
+| count(distinct ipid) |  max(date) | max(datetime (date, 'unixepoch')) |
+|----------------------+------------+-----------------------------------|
+|                    7 | 1742216884 | 2025-03-17 13:08:04               |
+
+#+begin_src sqlite
+-- Traffic per Google-related user agent, heaviest consumer first.
+-- length is divided by 1024 to report KiB, so it is presumably stored in bytes.
+SELECT
+    round(total(length) / 1024, 2) AS "tx KiB",
+    count(*) AS hits,
+    agentid,
+    user_agent
+FROM
+    logs
+    INNER JOIN agent ON logs.agentid = agent.id
+WHERE
+    -- SQLite's LIKE ignores ASCII case by default, so this also matches "Google"
+    user_agent LIKE '%google%'
+GROUP BY
+    agent.id
+ORDER BY
+    "tx KiB" DESC
+#+end_src