From 69111ed741338ff940bbd598388fbbd2519cd0b9 Mon Sep 17 00:00:00 2001 From: Oscar Najera Date: Mon, 28 Apr 2025 00:28:13 +0200 Subject: robots visits --- webstats/logparse.lisp | 3 +- webstats/workforrobots.org | 201 ++++++++++++++++++++++++++++++++------------- 2 files changed, 146 insertions(+), 58 deletions(-) diff --git a/webstats/logparse.lisp b/webstats/logparse.lisp index 71864d8..b7e0577 100644 --- a/webstats/logparse.lisp +++ b/webstats/logparse.lisp @@ -26,7 +26,7 @@ (ppcre:register-groups-bind ((#'parse-integer date) month (#'parse-integer year hour min sec)) ("(\\d{2})/(\\w{3})/(\\d{4}):(\\d{2}):(\\d{2}):(\\d{2})" timestr) (- - (encode-universal-time sec min hour date (gethash month +short-month-names+) year) + (encode-universal-time sec min hour date (gethash month +short-month-names+) year 0) #.(encode-universal-time 0 0 0 1 1 1970 0)))) (= (time-to-posix "02/Mar/2025:00:00:16 +0000") 1740870016) @@ -37,7 +37,6 @@ "20.171.207.185 - - [02/Mar/2025:00:00:16 +0000] \"GET /pub/hi/scratch/plain/AoC2023/day02/input?h=simpler&id=e2f5b6c2d5bb67013ba1b612252781e4cd9b6fe1 HTTP/1.1\" 200 7017 \"-\" \"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.2; +https://openai.com/gptbot)\"") (list ip remote-log userid date method path version status length referrer agent)) - (defvar *sqlite* (sqlite:connect "/scratch/titan/apache2/cgit-logs.sqlite")) (sqlite:disconnect *sqlite*) diff --git a/webstats/workforrobots.org b/webstats/workforrobots.org index 2c8c542..76e7ddd 100644 --- a/webstats/workforrobots.org +++ b/webstats/workforrobots.org @@ -6,21 +6,39 @@ #+HTML_HEAD_EXTRA: #+HTML_HEAD_EXTRA: -I self-host my website to keep sovereignty and independence from large Internet -corporations. I don't have at the moment any analytics tool on this site. -However, some of my git repositories are also self-hosted, and there I have -access to the =Apache= logs. 
With all the =AI-hype= and coding assistants, I -wanted to have a look at how of my =software= are those AI companies taking. - -Analyzing my log files in the period from =2024-12-31 23:00:35= till =2025-04-20 -00:38:24= I'm surprised to see how dumb those AI companies are in the use of -their web crawlers. - -Table [[top-users]] shows the top /users/ of my git repository. The leading AI -companies =OpenAI= and =Anthropic= with their respective bots =GPTBot= and -=ClaudeBot= simply dominate. I found it unbelievable that they could extract -about =≈7GiB= of data. That is a lot of Bandwidth out of my server for a few git -repositories and in a lightweight web interphase. + +I self-host some of my git repositories to keep sovereignty and independence +from large Internet corporations. The public-facing repositories are for +everybody, and today that means for robots. With the =AI-hype= on coding +assistants, I wanted to have a look at what those AI companies are taking. It is +worse than everything: it is idiotically everything. They can't recognize that +they are parsing git repositories and use the appropriate way of downloading +them. 
+ +#+begin_src sqlite :exports none +SELECT + min(date), + min(datetime (date, 'unixepoch')), + min(datetime (date, 'unixepoch', 'localtime')), + max(date), + max(datetime (date, 'unixepoch')), + max(datetime (date, 'unixepoch', 'localtime')) +FROM + logs +#+end_src + +#+RESULTS[8f773b86167f2d36db335568f344063f13838a11]: +| min(date) | min(datetime (date, 'unixepoch')) | min(datetime (date, 'unixepoch', 'localtime')) | max(date) | max(datetime (date, 'unixepoch')) | max(datetime (date, 'unixepoch', 'localtime')) | +|------------+-----------------------------------+------------------------------------------------+------------+-----------------------------------+------------------------------------------------| +| 1735686035 | 2024-12-31 23:00:35 | 2025-01-01 00:00:35 | 1745109504 | 2025-04-20 00:38:24 | 2025-04-20 02:38:24 | +* Who is visiting +I analyzed the =Apache= log files of my =cgit= service in the period from +=2025-01-01= till =2025-04-20=. Table [[top-users]] shows the top /users/ of my public-facing +git repositories. The leading AI companies =OpenAI= and =Anthropic= with +their respective bots =GPTBot= and =ClaudeBot= simply dominate. I found it +unbelievable that they could extract about =≈7GiB= of data each. That is a lot +of bandwidth out of my server for a few git repositories and in a lightweight +web interface. #+begin_src sqlite :exports results --SELECT @@ -67,6 +85,12 @@ LIMIT 10 | 2500 | 53.7 | Mozilla/5.0 (compatible; SeekportBot; +https://bot.seekport.com) | | 3578 | 30.9 | Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.7049.52 Mobile Safari/537.36 (compatible; GoogleOther) | +That is the total. What does it look like as a function of time? Figure +[[fig:agent-traffic]] shows the load on the CGit frontend service by each visiting +agent. Hover over the plot to read the exact value for each agent at a given +time in the legend. 
You can highlight a specific curve by hovering over it or +its legend. + #+begin_src sqlite :results value file :file top_agent_traffic.csv :exports none SELECT date / 14400 * 14400 AS time, -- 4h bin @@ -94,58 +118,54 @@ GROUP BY #+RESULTS[87f78b2de4d43785f81e682925c6b6542d794883]: [[file:top_agent_traffic.csv]] -This is reviewed in figure [[fig:agent-traffic]] - #+attr_html: :id agent-traffic -#+CAPTION: Example of the process +#+CAPTION: Load on the CGit frontend service by each visiting agent. The first black dashed line shows the total requests at the server and uses the right axis scale. All other solid-filled lines use the left axis and represent the bandwidth usage. #+NAME: fig:agent-traffic [[./jsfill.png]] -#+caption: Caption shared by both figures -#+begin_quote -teusta sat -#+end_quote +You can see how aggressively the =ClaudeBot= scrapes pages, consuming a lot of +bandwidth in a /short/ time. =OpenAI-GPTBot= shows its rate limiting, and +performs its scraping over a /longer/ period of time. However, as seen in table +[[top-users]], it performs more than twice the number of requests and consumes =30%= more +bandwidth. -following theorem [[th:theproc]] +The rest of the visitors are bots too. =Barkrowler= is a regular visitor +gathering metrics for online marketing. =AhrefsBot= is of the same type, yet +started crawling in March. =Macintosh= is certainly a bot disguising itself as a +browser and constantly probing. =Scrapy=, also unknown, came at the start of the +year and never came back. -#+CAPTION: Example of the process -#+NAME: th:theproc -#+attr_html: :style display:flex; :id fun -#+begin_theorem -If an integer $n$ is greater than 2, then the equation $a^n + b^n = c^n$ -has no solutions in non-zero integers $a$, $b$, and $c$. -#+end_theorem +=PetalBot= is for a search engine with AI recommendation by =Huawei=; it lingers +and slowly scrapes everything. 
=Seekport= is a search engine; it came all of a +sudden, took as much as it found useful, =<1%= of what the big AI bots take, and +swiftly left again. -look fig [[figblo]] +=Bytespider= is almost background noise, but it is also used to train an LLM, this +time for ByteDance, the Chinese owner of TikTok. -#+NAME: figblo -#+attr_html: :class tusi figblo -#+CAPTION: fun block -#+begin_figure -satore -#+end_figure +The last one, =Google=, doesn't even seem to be the bot for indexing its search +engine, but rather one to test its Chrome browser and how it renders pages. +=Rest= is all the remaining /robots/ or /users/. They have consumed around +=≈400MiB=, placing them in aggregate in a behavior like =Macintosh, Scrapy, +PetalBot & AhrefsBot=. -#+begin_src sqlite -SELECT - round(total (length) / 1024, 2) "tx KiB", - count(*) hits, - agentid, - user_agent -FROM - logs - JOIN agent ON agentid = id - --where agentid in ( - --select id from agent -WHERE - user_agent LIKE '%google%' - --) -GROUP BY - id -ORDER BY - 1 DESC -#+end_src +* How should they visit? + +This is a collection of =git= repositories. Of course, you can browse some of my code, +some files, and that is it. Yet, when you want *everything*, the correct use of this +service is through the =git= client, downloading my publicly available software. +That makes the data a lot more useful, even for those AI companies, as the data +cleanup would be easier. They themselves should use their AI to recognize what +kind of page they are visiting and act accordingly instead of stupidly scraping +everything. + +How have the good citizens behaved? That is shown in table [[git-users]]. The =Software +Heritage= keeps mirrors of git repositories. It thus watches for updates and then +downloads them. There are other people besides them that downloaded, but in total, in +the same time they only downloaded =≈21MiB=. That is =0.3%= compared to +=ClaudeBot=. 
#+begin_src sqlite SELECT @@ -164,6 +184,8 @@ GROUP BY order by total(length) desc #+end_src +#+name: git-users +#+caption: Git users #+RESULTS[333fcbc738819c497f14b4445a3b45f391f0db7e]: | User Agent | hits | tx KiB | |----------------------------------------------------------------------------------+------+---------| @@ -178,6 +200,73 @@ order by total(length) desc |----------------------------------------------------------------------------------+------+---------| | Total | 2695 | 21005.6 | +* What are they looking at? +The web front end of git repositories, of course, but is there a pattern? + +#+begin_src sqlite +#+end_src + +#+RESULTS[5c52f5ea84c4e903cbbbf8dd8f73139532de16dc]: + + +#+begin_src sqlite +select count(*), total(length), status +from logs +where agentid = 4602 +group by status +#+end_src + +#+RESULTS[91700bd4c58675dde008a24f8e07cfcea3966f58]: +| count(*) | total(length) | status | +|----------+---------------+--------| +| 207240 | 497513096.0 | 200 | +| 348 | 1195624.0 | 404 | +| 183 | 186683.0 | 500 | +#+begin_src sqlite +select count(distinct ipid) +, count(distinct agentid) +from logs +where +agentid not in (143, 1, 19, 6, 4602, 3, 2, 4, 10306, 9) +#+end_src + +#+RESULTS[efe7c2c9c481b475baccee8d9ed48d814003b23e]: +| count(distinct ipid) | count(distinct agentid) | +|----------------------+-------------------------| +| 35144 | 20648 | + +#+begin_src sqlite +select count(distinct ipid), +max(date), + max(datetime (date, 'unixepoch')) +from logs +where agentid = 4602 +#+end_src + +#+RESULTS[49b56e0faa552ea7df0d2515250f8f16365a7a40]: +| count(distinct ipid) | max(date) | max(datetime (date, 'unixepoch')) | +|----------------------+------------+-----------------------------------| +| 7 | 1742216884 | 2025-03-17 13:08:04 | + +#+begin_src sqlite +SELECT + round(total (length) / 1024, 2) "tx KiB", + count(*) hits, + agentid, + user_agent +FROM + logs + JOIN agent ON agentid = id + --where agentid in ( + --select id from agent +WHERE + user_agent LIKE 
'%google%' + --) +GROUP BY + id +ORDER BY + 1 DESC +#+end_src -- cgit v1.2.3