#+title: Work for robots
#+PROPERTY: header-args:sqlite :db /scratch/titan/apache2/cgit-logs.sqlite :colnames yes
#+PROPERTY: header-args :exports both :cache yes :eval no-export
#+HTML_HEAD: <script src="http://127.0.0.1:8095/skewer"></script>
#+HTML_HEAD: <link rel="stylesheet" href="static/uPlot.min.css" />
#+HTML_HEAD_EXTRA: <script src="static/uPlot.iife.min.js"></script>

I self-host some of my git repositories to keep sovereignty and independence
from large Internet corporations. Public-facing repositories are for everybody,
and today that means for robots. Robots are the main consumers of my work. With
the =AI-hype=, I wanted to have a look at what those AI companies are collecting
from my work. It is worse than simply everything: it is idiotically everything.
They can't even recognize that they are crawling git repositories and use the
appropriate way of downloading them.

#+begin_src sqlite :exports none
SELECT
  min(date),
  min(datetime (date, 'unixepoch')),
  min(datetime (date, 'unixepoch', 'localtime')),
  max(date),
  max(datetime (date, 'unixepoch')),
  max(datetime (date, 'unixepoch', 'localtime'))
FROM
  logs
#+end_src

#+RESULTS[8f773b86167f2d36db335568f344063f13838a11]:
| min(date) | min(datetime (date, 'unixepoch')) | min(datetime (date, 'unixepoch', 'localtime')) | max(date) | max(datetime (date, 'unixepoch')) | max(datetime (date, 'unixepoch', 'localtime')) |
|------------+-----------------------------------+------------------------------------------------+------------+-----------------------------------+------------------------------------------------|
| 1735686035 | 2024-12-31 23:00:35 | 2025-01-01 00:00:35 | 1745109504 | 2025-04-20 00:38:24 | 2025-04-20 02:38:24 |

* Who is visiting
I analyzed the =Apache= log files of my =cgit= service for the period from
=2025-01-01= to =2025-04-20=. Table [[top-users]] shows the top /users/ of my
public-facing git repository. The leading AI companies =OpenAI= and =Anthropic=,
with their respective bots =GPTBot= and =ClaudeBot=, simply dominate the load on
the service. I found it hard to believe that they each extracted roughly =7GiB=
of data. That is a lot of bandwidth out of my server for a few git repositories
behind a lightweight web interface.

#+begin_src sqlite :exports results
--SELECT
  --count(*) AS Requests,
  --round(total (length) / 1024 / 1024, 1) "Tx MiB",
  ----0,
  --'Everybody else' AS "User Agent"
--FROM
  --logs
--WHERE
  --agentid NOT IN (143, 1, 19, 6, 4602, 3, 2, 4, 10306, 9)
  --AND path NOT LIKE '/ingrid/%'
--UNION
SELECT
  count(*) AS Requests,
  round(total (length) / 1024 / 1024, 1) "Tx MiB",
  --agentid,
  user_agent AS "User Agent"
FROM
  logs
  JOIN agent ON agent.id = logs.agentid
WHERE
  path NOT LIKE '/ingrid/%'
GROUP BY
  agentid
ORDER BY
  2 DESC
LIMIT 10
#+end_src

#+name: top-users
#+caption: Top 10 /users/ ranked by bandwidth usage (/Tx MiB/). The /User Agent/ column shows how each client identifies itself.
#+RESULTS[36d7b647efa39c3af86581279748a2bb53d034f3]:
| Requests | Tx MiB | User Agent |
|----------+--------+------------|
| 3572480 | 8819.6 | Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; *GPTBot* /1.2; +https://openai.com/gptbot) |
| 1617262 | 6766.3 | Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; *ClaudeBot* /1.0; +claudebot@anthropic.com) |
| 273968 | 721.4 | Mozilla/5.0 (compatible; *Barkrowler* /0.9; +https://babbar.tech/crawler) |
| 80159 | 498.3 | Mozilla/5.0 (*Macintosh*; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 |
| 207771 | 475.8 | *Scrapy* /2.11.2 (+https://scrapy.org) |
| 69697 | 466.1 | Mozilla/5.0 (Linux; Android 7.0;) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; *PetalBot*;+https://webmaster.petalsearch.com/site/petalbot) |
| 59832 | 416.4 | Mozilla/5.0 (compatible; *AhrefsBot* /7.0; +http://ahrefs.com/robot/) |
| 14142 | 83.3 | Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; *Bytespider*; spider-feedback@bytedance.com) |
| 2500 | 53.7 | Mozilla/5.0 (compatible; *SeekportBot*; +https://bot.seekport.com) |
| 3578 | 30.9 | Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.7049.52 Mobile Safari/537.36 (compatible; *Google* Other) |
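
The queries below refer to agents by the numeric =agentid= values stored in the
database (for example =143= for =GPTBot= and =1= for =ClaudeBot=). A minimal
sketch of how those ids can be resolved, assuming the same =agent(id,
user_agent)= lookup table joined above:

#+begin_src sqlite :exports code :eval no
-- Sketch: resolve the numeric ids of the two biggest AI crawlers.
-- Assumes the agent(id, user_agent) lookup table used in the query above.
SELECT
  id,
  user_agent
FROM
  agent
WHERE
  user_agent LIKE '%GPTBot%'
  OR user_agent LIKE '%ClaudeBot%'
#+end_src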
What does this look like over time? Figure [[fig:agent-traffic]] shows the load
that each visiting agent puts on the CGit frontend service over time. The plot
is interactive: hover over it to read, in the legend, the exact value for each
agent at a given time; hover over a curve or its legend entry to highlight it;
click a legend entry to toggle that curve.

#+begin_src sqlite :results value file :file top_agent_traffic.csv :exports none
SELECT
  date / 14400 * 14400 AS time, -- 4h bin
  count(*) AS requests,
  total (length) FILTER (WHERE agentid = 143) AS "OpenAI-GPTBot",
  total (length) FILTER (WHERE agentid = 1) AS "ClaudeBot",
  total (length) FILTER (WHERE agentid = 19) AS "Barkrowler",
  total (length) FILTER (WHERE agentid = 6) AS "Macintosh",
  total (length) FILTER (WHERE agentid = 4602) AS "Scrapy",
  total (length) FILTER (WHERE agentid = 3) AS "PetalBot",
  total (length) FILTER (WHERE agentid = 2) AS "AhrefsBot",
  total (length) FILTER (WHERE agentid = 4) AS "Bytespider",
  total (length) FILTER (WHERE agentid = 10306) AS "SeekportBot",
  total (length) FILTER (WHERE agentid = 9) AS "Google",
  total (length) FILTER (WHERE agentid not in (143, 1, 19, 6, 4602, 3, 2, 4, 10306, 9)) AS "Rest"
FROM
  logs
WHERE
  path NOT LIKE '/ingrid/%'
  AND date NOT NULL
GROUP BY
  time
#+end_src

#+RESULTS[2ec6d40ab4f5a844bdbd855884f8d0a6346fd780]:
[[file:top_agent_traffic.csv]]

#+attr_html: :id agent-traffic
#+CAPTION: Load on the CGit frontend service from each visiting agent. The black dashed line shows the total number of requests and uses the right axis; all other, solid-filled lines show bandwidth usage and use the left axis.
#+NAME: fig:agent-traffic
[[./jsfill.png]]

You can see how aggressively =ClaudeBot= scrapes pages, using a lot of bandwidth
in a /short/ time. =OpenAI-GPTBot=, on the other hand, seems rate limited,
because it spreads its scraping over a /longer/ period of time. However, as seen
in table [[top-users]], it performs more than twice the number of requests and
consumes =30%= more bandwidth.
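
A rough way to quantify that difference in pacing is to compare how many days
each bot was active and how much it transferred per active day. This is only a
sketch, assuming the same =logs= schema used above with =date= as a Unix
timestamp:

#+begin_src sqlite :exports code :eval no
-- Sketch: burst versus drip behaviour of the two big AI crawlers.
-- 143 = OpenAI-GPTBot, 1 = ClaudeBot (see the agent lookup above).
SELECT
  CASE agentid WHEN 143 THEN 'OpenAI-GPTBot' ELSE 'ClaudeBot' END AS Agent,
  count(DISTINCT date / 86400) AS "Active days",
  round(total (length) / 1024 / 1024 / count(DISTINCT date / 86400), 1) AS "MiB per active day"
FROM
  logs
WHERE
  agentid IN (143, 1)
  AND path NOT LIKE '/ingrid/%'
GROUP BY
  agentid
#+end_src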
The rest of the visitors are bots too. =Barkrowler= is a regular visitor
gathering metrics for online marketing. =AhrefsBot= is of the same type, but only
started crawling in March. =Macintosh= is almost certainly a bot hiding behind a
browser user agent and constantly probing. =Scrapy= is also unknown; it came at
the start of the year and never came back.

=PetalBot= belongs to =Huawei='s search engine with AI recommendations; it
lingers and slowly scrapes everything. =Seekport= is a search engine; it came all
of a sudden, took as much as it found useful, =<1%= of what the big AI bots take,
and swiftly left again.

=Bytespider= is almost background noise, but it too exists to train an LLM, this
time for =ByteDance=, the Chinese owner of =TikTok=.

The last one, =Google=, doesn't even seem to be the bot that indexes its search
engine, but rather one that tests how its =Chrome= browser renders pages.

=Rest= is all the remaining /robots/ or /users/. Together they consumed around
=400MiB=, which puts their aggregate behaviour in the same league as =Macintosh=,
=Scrapy=, =PetalBot= and =AhrefsBot=. Most of them are hacker bots probing the
site. It also means that =~400MiB= is roughly what it takes to crawl the whole
site. AI crawlers siphoning *10X* that amount is abusive.

* How should they visit?
=CGit= is a web interface for =git= repositories. You can browse some of my
code and some files, and that is it. If you want *everything*, the correct way
to use this service is through the =git= client, downloading my publicly
available software.

That would also make the data a lot more useful, even for those AI companies,
because the cleanup would be easier. They should use their own AI to recognize
what kind of page they are visiting and act accordingly, instead of stupidly
scraping everything.

How have the good citizens behaved? That is shown in table [[git-users]]. The
=Software Heritage= project keeps mirrors of git repositories; it watches for
updates and then downloads them. Other people cloned besides them, but in total
they all transferred only =≈21MiB=. That is =0.3%= of what =ClaudeBot= consumed.

#+begin_src sqlite :exports results
SELECT
  --agent.id,
  user_agent as "User Agent",
  count(*) AS hits,
  round(total (length) / 1024, 1) AS "tx KiB"
FROM
  logs
  JOIN agent ON logs.agentid = agent.id
WHERE
  user_agent LIKE '%git%'
  and id != 10897
GROUP BY
  user_agent
order by total(length) desc
#+end_src

#+name: git-users
#+caption: Git clients and the bandwidth they used.
#+RESULTS[333fcbc738819c497f14b4445a3b45f391f0db7e]:
| User Agent | hits | tx KiB |
|-----------------------------------------------------------------------------------+------+---------|
| git/2.40.3 | 1075 | 12821.3 |
| git/2.34.1 | 1149 | 3687.1 |
| Software Heritage dumb Git loader | 337 | 2533.6 |
| git/2.48.1 | 115 | 1908.6 |
| Software Heritage cgit lister v6.9.3 (+https://www.softwareheritage.org/contact) | 8 | 21.3 |
| Software Heritage cgit lister v6.9.2 (+https://www.softwareheritage.org/contact) | 8 | 21.0 |
| git/dulwich/0.22.7 | 2 | 8.5 |
| git/dulwich/0.22.6 | 1 | 4.2 |
|-----------------------------------------------------------------------------------+------+---------|
| Total | 2695 | 21005.6 |
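
The table above identifies the well-behaved clients by their user agent. A
complementary check, and only a sketch, would be to look at the request paths
instead, assuming clone traffic shows up in these logs under the usual git HTTP
endpoints (=info/refs=, =git-upload-pack=, =/objects/=); that would also catch a
git client hiding behind a browser user agent:

#+begin_src sqlite :exports code :eval no
-- Sketch: find clone-style traffic by path instead of user agent.
-- The endpoint patterns are an assumption about how clones appear in these logs.
SELECT
  count(*) AS Requests,
  round(total (length) / 1024, 1) AS "tx KiB"
FROM
  logs
WHERE
  path LIKE '%/info/refs%'
  OR path LIKE '%git-upload-pack%'
  OR path LIKE '%/objects/%'
#+end_src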
* What are they looking at?
The web front end of the git repositories, of course, but is there a pattern?

Table [[status-codes]] shows the status codes of all requests performed by the
users. The failure rate of =OpenAI= is alarming: of its =3.5 million= requests,
=15%= are client errors (=404= page not found), and answering them consumes
about =2GiB= of bandwidth. What is their scraper doing so wrong? =ClaudeBot=, as
noted earlier, manages to scrape the site with half the requests and an error
rate of =1.6%=.

=Everybody else= is all the remaining users. They do have an error rate of
=25%=, but that is normal, as they are mostly hacker robots scanning for
vulnerabilities. You are always under attack on the internet.

#+begin_src sqlite :exports results
SELECT
  CASE WHEN agentid = 143 THEN
    'OpenAI-GPTBot'
  WHEN agentid = 1 THEN
    'ClaudeBot'
  WHEN agentid = 19 THEN
    'Barkrowler'
  WHEN agentid = 6 THEN
    'Macintosh'
  WHEN agentid = 4602 THEN
    'Scrapy'
  WHEN agentid = 3 THEN
    'PetalBot'
  WHEN agentid = 2 THEN
    'AhrefsBot'
  WHEN agentid = 4 THEN
    'Bytespider'
  WHEN agentid = 10306 THEN
    'SeekportBot'
  WHEN agentid = 9 THEN
    'Google'
  ELSE
    'Everybody else'
  END AS Agent,
  --count(*) AS "Requests",
  count(*) FILTER (WHERE status BETWEEN 200 AND 299) AS "2XX",
  count(*) FILTER (WHERE status BETWEEN 300 AND 399) AS "3XX",
  count(*) FILTER (WHERE status BETWEEN 400 AND 499) AS "4XX",
  count(*) FILTER (WHERE status BETWEEN 500 AND 599) AS "5XX",
  round((total(length) FILTER (WHERE status BETWEEN 400 AND 499))/1024/1024, 2) AS "4XX MiB",
  round((100.0 * count(*) FILTER (WHERE status BETWEEN 400 AND 499)) / count(*), 2) AS "4XX %"
FROM
  logs
GROUP BY
  Agent
ORDER BY
  "4XX" DESC
#+end_src

#+name: status-codes
#+caption: HTTP status codes per user agent. /4XX MiB/ is the bandwidth spent answering client errors; /4XX %/ is the share of each agent's requests that ended in a client error.
#+RESULTS[b4402559f97ad9a4f1ec20091284651af575ffeb]:
| Agent | 2XX | 3XX | 4XX | 5XX | 4XX MiB | 4XX % |
|----------------+---------+-----+--------+-----+---------+-------|
| / | < | | | | < | |
| OpenAI-GPTBot | 3017848 | 0 | 554511 | 121 | 2060.23 | 15.52 |
| Everybody else | 99066 | 467 | 34630 | 14 | 101.96 | 25.81 |
| ClaudeBot | 1591179 | 26 | 25611 | 446 | 162.67 | 1.58 |
| Barkrowler | 272343 | 0 | 1618 | 7 | 5.35 | 0.59 |
| Macintosh | 79071 | 2 | 1086 | 0 | 7.87 | 1.35 |
| Bytespider | 13609 | 0 | 531 | 2 | 3.94 | 3.75 |
| PetalBot | 69223 | 0 | 473 | 1 | 3.2 | 0.68 |
| Scrapy | 207240 | 0 | 348 | 183 | 1.14 | 0.17 |
| AhrefsBot | 59733 | 0 | 90 | 9 | 0.61 | 0.15 |
| Google | 3576 | 0 | 2 | 0 | 0.02 | 0.06 |
| SeekportBot | 2500 | 0 | 0 | 0 | 0.0 | 0.0 |

Let's have a look at the most requested missing pages. Table [[fail-pages]] lists
each page path, how much bandwidth (=tx=) it consumed, and the number of
requests per bot. With one exception, all pages are placeholder links used in
website theme templates. The repository =hugo-minimalist-theme= is a [[https://gohugo.io][Hugo]]
theme; within the curly braces ={{ }}= the rendering engine substitutes values.
The crawlers' HTML parsers evidently read the placeholders raw from the =a=
tag's link and request them as pages. =ClaudeBot= seems to keep track of error
pages and not query them again. =OpenAI= is incapable of doing that, and
stubbornly tries over and over.

If you grep for the string /href="{{ .RelPermalink }}"/ over the entire git
history of that repository, you find that it appears =954= times to date. It is
surprising and annoying that =OpenAI= manages to request it three times that
often.

#+begin_src sqlite :exports results
SELECT
  replace(replace(replace(replace(path, '%7B', '{'), '%7D', '}'), '|', '\vert'), '%20', ' ') AS Page,
  round(total (length) / 1024 / 1024, 2) AS "tx MiB",
  count(*) FILTER (WHERE agentid = 143) AS "OpenAI",
  count(*) FILTER (WHERE agentid = 1) AS "ClaudeBot",
  count(*) FILTER (WHERE agentid NOT IN (1, 143)) AS "Rest"
  --substr(path, 0, 50)
FROM
  logs
WHERE
  path NOT LIKE '/ingrid/%'
  AND status = 404
GROUP BY
  path
ORDER BY
  2 DESC
LIMIT 10
#+end_src

#+name: fail-pages
#+caption: Top 10 =404= not-found pages, ranked by the bandwidth spent answering them.
#+RESULTS[19605bfdef59599b47ed8f0e4b3bce71daaca7d3]:
| Page | tx MiB | OpenAI | ClaudeBot | Rest |
|---------------------------------------------------------------------------+--------+--------+-----------+------|
| /hugo-minimalist-theme/plain/layouts/partials/{{ .RelPermalink }} | 8.36 | 2805 | 3 | 7 |
| /hugo-minimalist-theme/plain/layouts/partials/{{ .URL }} | 5.39 | 1629 | 1 | 13 |
| /hugo-minimalist-theme/plain/layouts/partials/{{ . }} | 4.82 | 1559 | 1 | 4 |
| /hugo-minimalist-theme/plain/layouts/partials/{{ $href }} | 4.28 | 1209 | 4 | 5 |
| /.env | 3.84 | 0 | 0 | 744 |
| /hugo-minimalist-theme/plain/layouts/partials/{{ .Permalink }} | 3.75 | 1060 | 2 | 15 |
| /hugo-minimalist-theme/plain/layouts/partials/{{ $pag.Next.URL }} | 3.36 | 916 | 1 | 7 |
| /hugo-minimalist-theme/plain/layouts/partials/{{ $pag.Prev.URL }} | 3.34 | 912 | 0 | 7 |
| /hugo-minimalist-theme/plain/layouts/partials/{{ if ne .MediaType.SubType | 2.95 | 817 | 1 | 0 |
| /hugo-minimalist-theme/plain/layouts/taxonomy/{{ .Name \vert urlize }} | 2.86 | 745 | 5 | 0 |
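
One way to test the claim that =OpenAI= keeps re-requesting pages it has already
seen fail, while =ClaudeBot= does not, is to count how many =404= hits each bot
makes per distinct missing path. Again only a sketch under the same schema
assumptions as above:

#+begin_src sqlite :exports code :eval no
-- Sketch: average number of 404 hits per distinct missing path, per bot.
-- A value close to 1 means the bot rarely retries a known-missing page.
SELECT
  CASE agentid WHEN 143 THEN 'OpenAI-GPTBot' ELSE 'ClaudeBot' END AS Agent,
  count(*) AS "404 hits",
  count(DISTINCT path) AS "Distinct paths",
  round(1.0 * count(*) / count(DISTINCT path), 1) AS "Hits per path"
FROM
  logs
WHERE
  status = 404
  AND agentid IN (143, 1)
  AND path NOT LIKE '/ingrid/%'
GROUP BY
  agentid
#+end_src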
What about the hackers? Table [[hacker-attacks]] excludes the AI bots to expose
the attack surface. The top entry is malformed requests to the main site,
answered with =400 Bad Request=. Next come attempts to steal environment secrets
from the =.env= file or from the git configuration. After that, the most common
type of attack aims at the remote code execution vulnerability in =PHPUnit=, by
looking for the file =eval-stdin.php=.

#+begin_src sqlite :exports results
SELECT
  round(total (length) / 1024 / 1024, 2) AS "Tx MiB",
  count(*) Requests,
  count(DISTINCT agentid) Agents,
  count(DISTINCT ipid) IPs,
  group_concat (DISTINCT status) AS "Errors",
  group_concat (DISTINCT request_method) AS "Methods",
  replace(path, '_', '\under{}') AS path
FROM
  logs
WHERE
  path NOT LIKE '/ingrid/%'
  AND status >= 300
  AND agentid NOT IN (1, 143)
GROUP BY
  path
  --, status
ORDER BY
  --sum(count(*)) OVER (PARTITION BY path) DESC,
  Requests DESC
LIMIT 10
#+end_src

#+name: hacker-attacks
#+caption: Top 10 attacks leading to error pages, ranked by number of requests. The /Agents/ and /IPs/ columns count the distinct user agents and IP addresses making the requests.
#+RESULTS[5d9125d3f545af74d94789c0893d257619926437]:
| Tx MiB | Requests | Agents | IPs | Errors | Methods | path |
|--------+----------+--------+------+-------------+----------+------|
| 3.17 | 3482 | 6 | 1139 | 400,421,408 | GET,POST | / |
| 3.84 | 744 | 368 | 256 | 404 | GET,POST | /.env |
| 2.26 | 409 | 1 | 11 | 404 | GET | /cgi-bin/luci/;stok=/locale |
| 2.02 | 381 | 182 | 121 | 404 | GET | /.git/config |
| 0.57 | 222 | 12 | 167 | 404 | GET,POST | /vendor/phpunit/phpunit/src/Util/PHP/eval-stdin.php |
| 1.08 | 195 | 1 | 1 | 404 | GET | /actuator/gateway/routes |
| 0.88 | 173 | 2 | 137 | 404 | POST | /hello.world?%ADd+allow\under{}url\under{}include%3d1+%ADd+auto\under{}prepend\under{}file%3dphp://input |
| 0.23 | 157 | 2 | 127 | 404 | GET | /vendor/phpunit/phpunit/Util/PHP/eval-stdin.php |
| 0.22 | 152 | 2 | 123 | 404 | GET | /vendor/phpunit/src/Util/PHP/eval-stdin.php |
| 0.22 | 148 | 2 | 119 | 404 | GET | /vendor/phpunit/phpunit/LICENSE/eval-stdin.php |
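
If one wanted to turn these findings into a blocklist, a starting point could be
the addresses producing the most client errors. This is only a sketch over the
same assumed schema; =ipid= is just a numeric key here, and resolving it to an
actual address would need the corresponding lookup table:

#+begin_src sqlite :exports code :eval no
-- Sketch: the most aggressive sources of 4XX probes, excluding the big AI bots.
SELECT
  ipid,
  count(*) AS "4XX requests",
  count(DISTINCT path) AS "Distinct paths probed"
FROM
  logs
WHERE
  status BETWEEN 400 AND 499
  AND agentid NOT IN (1, 143)
GROUP BY
  ipid
ORDER BY
  2 DESC
LIMIT 10
#+end_src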
* Future plans
Quite a few webmasters have been annoyed by this abusive scraping by AI bots.
The project [[https://xeiaso.net/blog/2025/anubis/][Anubis]] imposes a proof-of-work /tax/ on visitors of a web page,
which reduces the abusive scraping by AI bots.

I personally dislike that idea. It does create an extra expense for the AI
companies that indiscriminately crawl the internet, but nobody really wins. It
is a failure of our internet ecosystem that micropayments still aren't a
reality. For myself, this means being part of the change and bringing my Bitcoin
Lightning tipping system back online, this time with real coins. We need to get
people used to paying for resources on the internet, and for that we need
working infrastructure; we can't wait for the banking system to provide it. In
my opinion, the main reason our internet so aggressively invades our privacy is
that the banking system never provided a way to move money across the internet,
so the only people who could pay were advertising companies.

Knowing how stupid the AI crawlers are, I believe poisoning the training data is
a better cure for such aggressive and mindless crawling than a proof-of-work
tax. Projects like [[https://iocaine.madhouse-project.org/][Iocaine]] provide a way to do that, and that is what I'll
implement in the future.
#+begin_export html
<script type="text/javascript">
  addEventListener("load", () => {
    function csvFloat(data) {
      let headers = data[0];
      let series = headers.map((_, idx) =>
        data.slice(1).map((row) => parseFloat(row[idx])),
      );
      return [headers, series];
    }
    function responseParseCSV(response) {
      if (response.ok)
        return response.text().then((data) =>
          data
            .split(/\n/)
            .filter((x) => x)
            .map((row) => row.split(/,/)),
        );
      throw new Error("not 2XX resp");
    }
    function withSuffix(val, suffix) {
      return val.toFixed(1).replace(/.?0+$/, "").concat("", suffix);
    }
    function siScaling(value) {
      var v = Math.abs(value);
      return 0 === v
        ? [0, ""]
        : v >= 1000000000000000.0
          ? [value / 1000000000000000.0, "P"]
          : v >= 1000000000000.0
            ? [value / 1000000000000.0, "T"]
            : v >= 1000000000.0
              ? [value / 1000000000.0, "G"]
              : v >= 1000000.0
                ? [value / 1000000.0, "M"]
                : v >= 1000.0
                  ? [value / 1000.0, "K"]
                  : v >= 0.6
                    ? [value, ""]
                    : v >= 0.001
                      ? [value / 0.001, "m"]
                      : v >= 0.000001
                        ? [value / 0.000001, "μ"]
                        : v >= 0.000000001
                          ? [value / 0.000000001, "n"]
                          : v >= 0.000000000001
                            ? [value / 0.000000000001, "p"]
                            : null;
    }
    function scaling(val, suffix) {
      return withSuffix.apply(this, siScaling(val));
    }

    function spacedColor(idx, alpha) {
      if (alpha === undefined) {
        alpha = "/ 1";
      }
      return "hsl(" + 137.506 * idx + " 70% 55% " + alpha + ")";
    }
    function agentChart(header, series, container) {
      const opts = {
        width: 920,
        height: 600,
        hooks: {
          setSeries: [
            (u, seriesIdx, opts) => {
              if (opts.focus != null) {
                u.series.forEach((s, i) => {
                  s.width = i == seriesIdx ? 3 : 1;
                });
              }
            },
          ],
        },
        focus: { alpha: 0.5 },
        cursor: {
          focus: {
            prox: 1e6,
            bias: 0,
            dist: (self, seriesIdx, dataIdx, valPos, curPos) => {
              return valPos - curPos;
            },
          },
        },

        series: [
          {},
          {
            label: header[1],
            stroke: "black",
            dash: [10, 5],
            value: (u, v) => (v ? scaling(v) : v),
            scale: "hits",
          },
        ].concat(
          header.slice(2).map((name, idx) => ({
            label: name,
            stroke: spacedColor(idx),
            fill: spacedColor(idx, "/ 0.1"),
            value: (u, v) => (v ? scaling(v) + "B" : v),
          })),
        ),
        axes: [
          {},
          {
            values: (u, vals, space) =>
              vals.map((v) => (v ? scaling(v) + "B" : v)),
            size: 60,
            label: "Bandwidth",
            labelSize: 50,
          },
          {
            side: 1,
            scale: "hits",
            label: "Requests",
            grid: { show: false },
            values: (u, vals, space) => vals.map((v) => (v ? scaling(v) : v)),
            labelSize: 50,
          },
        ],
      };
      let uplot = new uPlot(opts, series, container);
      container["uobj"] = uplot;
    }

    fetch("/top_agent_traffic.csv")
      .then(responseParseCSV)
      .then(csvFloat)
      .then(([headers, series]) => {
        let cont = document.querySelector("#agent-traffic").parentNode;
        cont.innerHTML = "";
        agentChart(headers, series, cont);
      });
  });
</script>
#+end_export