aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--webstats/logparse.lisp3
-rw-r--r--webstats/workforrobots.org201
2 files changed, 146 insertions, 58 deletions
diff --git a/webstats/logparse.lisp b/webstats/logparse.lisp
index 71864d8..b7e0577 100644
--- a/webstats/logparse.lisp
+++ b/webstats/logparse.lisp
@@ -26,7 +26,7 @@
(ppcre:register-groups-bind ((#'parse-integer date) month (#'parse-integer year hour min sec))
("(\\d{2})/(\\w{3})/(\\d{4}):(\\d{2}):(\\d{2}):(\\d{2})" timestr)
(-
- (encode-universal-time sec min hour date (gethash month +short-month-names+) year)
+ (encode-universal-time sec min hour date (gethash month +short-month-names+) year 0)
#.(encode-universal-time 0 0 0 1 1 1970 0))))
(= (time-to-posix "02/Mar/2025:00:00:16 +0000") 1740870016)
@@ -37,7 +37,6 @@
"20.171.207.185 - - [02/Mar/2025:00:00:16 +0000] \"GET /pub/hi/scratch/plain/AoC2023/day02/input?h=simpler&id=e2f5b6c2d5bb67013ba1b612252781e4cd9b6fe1 HTTP/1.1\" 200 7017 \"-\" \"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)\"")
(list ip remote-log userid date method path version status length referrer agent))
-
(defvar *sqlite*
(sqlite:connect "/scratch/titan/apache2/cgit-logs.sqlite"))
(sqlite:disconnect *sqlite*)
diff --git a/webstats/workforrobots.org b/webstats/workforrobots.org
index 2c8c542..76e7ddd 100644
--- a/webstats/workforrobots.org
+++ b/webstats/workforrobots.org
@@ -6,21 +6,39 @@
#+HTML_HEAD_EXTRA: <script src="static/uPlot.iife.min.js"></script>
#+HTML_HEAD_EXTRA: <script src="plots.js"></script>
-I self-host my website to keep sovereignty and independence from large Internet
-corporations. I don't have at the moment any analytics tool on this site.
-However, some of my git repositories are also self-hosted, and there I have
-access to the =Apache= logs. With all the =AI-hype= and coding assistants, I
-wanted to have a look at how of my =software= are those AI companies taking.
-
-Analyzing my log files in the period from =2024-12-31 23:00:35= till =2025-04-20
-00:38:24= I'm surprised to see how dumb those AI companies are in the use of
-their web crawlers.
-
-Table [[top-users]] shows the top /users/ of my git repository. The leading AI
-companies =OpenAI= and =Anthropic= with their respective bots =GPTBot= and
-=ClaudeBot= simply dominate. I found it unbelievable that they could extract
-about =≈7GiB= of data. That is a lot of Bandwidth out of my server for a few git
-repositories and in a lightweight web interphase.
+
+I self-host some of my git repositories to keep sovereignty and independence
+from large Internet corporations. The public facing repositories are for
+everybody, and today that means for robots. With the =AI-hype= on coding
+assistants, I wanted to have a look at what those AI companies are taking. It is
+worse than everything: it is idiotically everything. They can't recognize that
+they are parsing git repositories and use the appropriate way of downloading
+them.
+
+#+begin_src sqlite :exports none
+SELECT
+ min(date),
+ min(datetime (date, 'unixepoch')),
+ min(datetime (date, 'unixepoch', 'localtime')),
+ max(date),
+ max(datetime (date, 'unixepoch')),
+ max(datetime (date, 'unixepoch', 'localtime'))
+FROM
+ logs
+#+end_src
+
+#+RESULTS[8f773b86167f2d36db335568f344063f13838a11]:
+| min(date) | min(datetime (date, 'unixepoch')) | min(datetime (date, 'unixepoch', 'localtime')) | max(date) | max(datetime (date, 'unixepoch')) | max(datetime (date, 'unixepoch', 'localtime')) |
+|------------+-----------------------------------+------------------------------------------------+------------+-----------------------------------+------------------------------------------------|
+| 1735686035 | 2024-12-31 23:00:35 | 2025-01-01 00:00:35 | 1745109504 | 2025-04-20 00:38:24 | 2025-04-20 02:38:24 |
+* Who is visiting
+I analyzed the =Apache= log files of my =cgit= service in the period from
+=2025-01-01= till =2025-04-20=. Table [[top-users]] shows the top /users/ of my
+public-facing git repositories. The leading AI companies =OpenAI= and =Anthropic= with
+their respective bots =GPTBot= and =ClaudeBot= simply dominate. I found it
+unbelievable that they could extract =≈7GiB= of data each. That is a lot
+of bandwidth out of my server for a few git repositories behind a lightweight
+web interface.
#+begin_src sqlite :exports results
--SELECT
@@ -67,6 +85,12 @@ LIMIT 10
| 2500 | 53.7 | Mozilla/5.0 (compatible; SeekportBot; +https://bot.seekport.com) |
| 3578 | 30.9 | Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.7049.52 Mobile Safari/537.36 (compatible; GoogleOther) |
+That is the total. What does it look like as a function of time? Figure
+[[fig:agent-traffic]] shows the load on CGit frontend service by each visiting
+agent. Hover over the plot to read the exact value for each agent at a given
+time on the legend. You can highlight a specific curve by hovering over it or
+its legend.
+
#+begin_src sqlite :results value file :file top_agent_traffic.csv :exports none
SELECT
date / 14400 * 14400 AS time, -- 4h bin
@@ -94,58 +118,54 @@ GROUP BY
#+RESULTS[87f78b2de4d43785f81e682925c6b6542d794883]:
[[file:top_agent_traffic.csv]]
-This is reviewed in figure [[fig:agent-traffic]]
-
#+attr_html: :id agent-traffic
-#+CAPTION: Example of the process
+#+CAPTION: Load on CGit frontend service by each visiting agent. The first black dashed line shows the total requests at the server and uses the right axis scale. All other solid-filled lines use the left axis and represent the bandwidth usage.
#+NAME: fig:agent-traffic
[[./jsfill.png]]
-#+caption: Caption shared by both figures
-#+begin_quote
-teusta sat
-#+end_quote
+You can see how aggressively the =ClaudeBot= scrapes pages, consuming a lot of
+bandwidth in a /short/ time. =OpenAI-GPTBot= shows its rate limiting, and
+performs its scraping over a /longer/ period of time. However, as seen in table
+[[top-users]], it performs more than twice the number of requests and consumes =30%= more
+bandwidth.
-following theorem [[th:theproc]]
+The rest of the visitors are bots too. =Barkrowler= is a regular visitor
+gathering metrics for online marketing. =AhrefsBot= is of the same type, yet
+started crawling in March. =Macintosh= is certainly a bot disguising itself as a
+browser and constantly probing. =Scrapy=, also unknown, came at the start of the
+year and never came back.
-#+CAPTION: Example of the process
-#+NAME: th:theproc
-#+attr_html: :style display:flex; :id fun
-#+begin_theorem
-If an integer $n$ is greater than 2, then the equation $a^n + b^n = c^n$
-has no solutions in non-zero integers $a$, $b$, and $c$.
-#+end_theorem
+=PetalBot= is for a search engine with AI recommendation by =Huawei=; it lingers
+and slowly scrapes everything. =Seekport= is a search engine; it came all of a
+sudden, took as much as it found useful, =<1%= of what the big AI bots take, and
+swiftly left again.
-look fig [[figblo]]
+=Bytespider= is almost background noise, but it also serves to train an LLM, this
+time for ByteDance, the Chinese owner of TikTok.
-#+NAME: figblo
-#+attr_html: :class tusi figblo
-#+CAPTION: fun block
-#+begin_figure
-satore
-#+end_figure
+The last one, =Google=, doesn't even seem to be the bot for indexing its search
+engine, but rather one to test its Chrome browser and how it renders pages.
+=Rest= is all the remaining /robots/ or /users/. They have consumed
+=≈400MiB=, which places them, in aggregate, on par with =Macintosh, Scrapy,
+PetalBot & AhrefsBot=.
-#+begin_src sqlite
-SELECT
- round(total (length) / 1024, 2) "tx KiB",
- count(*) hits,
- agentid,
- user_agent
-FROM
- logs
- JOIN agent ON agentid = id
- --where agentid in (
- --select id from agent
-WHERE
- user_agent LIKE '%google%'
- --)
-GROUP BY
- id
-ORDER BY
- 1 DESC
-#+end_src
+* How should they visit?
+
+This is a collection of =git= repositories. You can browse some of my code,
+some files, and that is it. Yet, when you want *everything*, the correct use of this
+service is through the =git= client, downloading my publicly available software.
+That makes the data a lot more useful, even for those AI companies, as the data
+cleanup would be easier. They themselves should use their AI to recognize what
+kind of page they are visiting and act accordingly instead of stupidly scraping
+everything.
+
+How have the good citizens behaved? That is in table [[git-users]]. The =Software
+Heritage= keeps mirrors of git repositories. It thus watches for updates and then
+downloads them. There are other people besides them that downloaded, but in total, in
+the same period, they only downloaded =≈21MiB=. That is =0.3%= compared to
+=ClaudeBot=.
#+begin_src sqlite
SELECT
@@ -164,6 +184,8 @@ GROUP BY
order by total(length) desc
#+end_src
+#+name: git-users
+#+caption: Git users
#+RESULTS[333fcbc738819c497f14b4445a3b45f391f0db7e]:
| User Agent | hits | tx KiB |
|----------------------------------------------------------------------------------+------+---------|
@@ -178,6 +200,73 @@ order by total(length) desc
|----------------------------------------------------------------------------------+------+---------|
| Total | 2695 | 21005.6 |
+* What are they looking at?
+The web front end of git repositories of course, but is there a pattern?
+
+#+begin_src sqlite
+#+end_src
+
+#+RESULTS[5c52f5ea84c4e903cbbbf8dd8f73139532de16dc]:
+
+
+#+begin_src sqlite
+select count(*), total(length), status
+from logs
+where agentid = 4602
+group by status
+#+end_src
+
+#+RESULTS[91700bd4c58675dde008a24f8e07cfcea3966f58]:
+| count(*) | total(length) | status |
+|----------+---------------+--------|
+| 207240 | 497513096.0 | 200 |
+| 348 | 1195624.0 | 404 |
+| 183 | 186683.0 | 500 |
+#+begin_src sqlite
+select count(distinct ipid)
+, count(distinct agentid)
+from logs
+where
+agentid not in (143, 1, 19, 6, 4602, 3, 2, 4, 10306, 9)
+#+end_src
+
+#+RESULTS[efe7c2c9c481b475baccee8d9ed48d814003b23e]:
+| count(distinct ipid) | count(distinct agentid) |
+|----------------------+-------------------------|
+| 35144 | 20648 |
+
+#+begin_src sqlite
+select count(distinct ipid),
+max(date),
+ max(datetime (date, 'unixepoch'))
+from logs
+where agentid = 4602
+#+end_src
+
+#+RESULTS[49b56e0faa552ea7df0d2515250f8f16365a7a40]:
+| count(distinct ipid) | max(date) | max(datetime (date, 'unixepoch')) |
+|----------------------+------------+-----------------------------------|
+| 7 | 1742216884 | 2025-03-17 13:08:04 |
+
+#+begin_src sqlite
+SELECT
+ round(total (length) / 1024, 2) "tx KiB",
+ count(*) hits,
+ agentid,
+ user_agent
+FROM
+ logs
+ JOIN agent ON agentid = id
+ --where agentid in (
+ --select id from agent
+WHERE
+ user_agent LIKE '%google%'
+ --)
+GROUP BY
+ id
+ORDER BY
+ 1 DESC
+#+end_src