aboutsummaryrefslogtreecommitdiffstats
path: root/webstats/workforrobots.org
diff options
context:
space:
mode:
authorOscar Najera <hi@oscarnajera.com>2025-04-28 03:04:31 +0200
committerOscar Najera <hi@oscarnajera.com>2025-04-28 03:04:31 +0200
commitd23c5887921a4af9982aadf8a09141ef226ce573 (patch)
treedd1a318d00cc3664cb1e929b7e55fa9ad8fe7892 /webstats/workforrobots.org
parent69111ed741338ff940bbd598388fbbd2519cd0b9 (diff)
downloadscratch-d23c5887921a4af9982aadf8a09141ef226ce573.tar.gz
scratch-d23c5887921a4af9982aadf8a09141ef226ce573.tar.bz2
scratch-d23c5887921a4af9982aadf8a09141ef226ce573.zip
fails
Diffstat (limited to 'webstats/workforrobots.org')
-rw-r--r--webstats/workforrobots.org356
1 files changed, 219 insertions, 137 deletions
diff --git a/webstats/workforrobots.org b/webstats/workforrobots.org
index 76e7ddd..8cfeafe 100644
--- a/webstats/workforrobots.org
+++ b/webstats/workforrobots.org
@@ -167,7 +167,7 @@ downloads. There are other people besides them that downloaded, but in total, in
the same time they only downloaded =≈21MiB=. That is =0.3%= compared to
=ClaudeBot=.
-#+begin_src sqlite
+#+begin_src sqlite :exports results
SELECT
--agent.id,
user_agent as "User Agent",
@@ -203,174 +203,168 @@ order by total(length) desc
* What are they looking at?
The web front end of git repositories of course, but it there a pattern?
-#+begin_src sqlite
-#+end_src
-
-#+RESULTS[5c52f5ea84c4e903cbbbf8dd8f73139532de16dc]:
+Table [[status-codes]] show the status codes of all requests performed by the users.
+The failure rate of =OpenAI= is alarming. It from its =3.5 million= requests
+=15%= are client errors, mostly not found pages of error =404=. What is their
+scraper doing so wrong? =ClaudeBot= as noted earlier, manages to scrape with
+half the requests and a failure rate of =1.6%=.
+=Everybody else= are the =Rest=, all users that can't be grouped into the major
+users. Yet they have a failure rate of =25%=, but that is normal as they are
+mostly hacker robots scanning for vulnerabilities.
-#+begin_src sqlite
-select count(*), total(length), status
-from logs
-where agentid = 4602
-group by status
-#+end_src
-
-#+RESULTS[91700bd4c58675dde008a24f8e07cfcea3966f58]:
-| count(*) | total(length) | status |
-|----------+---------------+--------|
-| 207240 | 497513096.0 | 200 |
-| 348 | 1195624.0 | 404 |
-| 183 | 186683.0 | 500 |
-#+begin_src sqlite
-select count(distinct ipid)
-, count(distinct agentid)
-from logs
-where
-agentid not in (143, 1, 19, 6, 4602, 3, 2, 4, 10306, 9)
-#+end_src
-
-#+RESULTS[efe7c2c9c481b475baccee8d9ed48d814003b23e]:
-| count(distinct ipid) | count(distinct agentid) |
-|----------------------+-------------------------|
-| 35144 | 20648 |
-
-#+begin_src sqlite
-select count(distinct ipid),
-max(date),
- max(datetime (date, 'unixepoch'))
-from logs
-where agentid = 4602
-#+end_src
-
-#+RESULTS[49b56e0faa552ea7df0d2515250f8f16365a7a40]:
-| count(distinct ipid) | max(date) | max(datetime (date, 'unixepoch')) |
-|----------------------+------------+-----------------------------------|
-| 7 | 1742216884 | 2025-03-17 13:08:04 |
-
-#+begin_src sqlite
+#+begin_src sqlite :exports results
SELECT
- round(total (length) / 1024, 2) "tx KiB",
- count(*) hits,
- agentid,
- user_agent
+ CASE WHEN agentid = 143 THEN
+ 'OpenAI-GPTBot'
+ WHEN agentid = 1 THEN
+ 'ClaudeBot'
+ WHEN agentid = 19 THEN
+ 'Barkrowler'
+ WHEN agentid = 6 THEN
+ 'Macintosh'
+ WHEN agentid = 4602 THEN
+ 'Scrapy'
+ WHEN agentid = 3 THEN
+ 'PetalBot'
+ WHEN agentid = 2 THEN
+ 'AhrefsBot'
+ WHEN agentid = 4 THEN
+ 'Bytespider'
+ WHEN agentid = 10306 THEN
+ 'SeekportBot'
+ WHEN agentid = 9 THEN
+ 'Google'
+ ELSE
+ 'Everybody else'
+ END AS Agent,
+ --count(*) AS "Requests",
+ count(*) FILTER (WHERE status BETWEEN 200 AND 299) AS "2XX",
+ count(*) FILTER (WHERE status BETWEEN 300 AND 399) AS "3XX",
+ count(*) FILTER (WHERE status BETWEEN 400 AND 499) AS "4XX",
+ --(100.0 * count(*) FILTER (WHERE status BETWEEN 400 AND 499)) / count(*) AS "fails 4XX",
+ count(*) FILTER (WHERE status BETWEEN 500 AND 599) AS "5XX"
FROM
logs
- JOIN agent ON agentid = id
- --where agentid in (
- --select id from agent
-WHERE
- user_agent LIKE '%google%'
- --)
GROUP BY
- id
+ Agent
ORDER BY
- 1 DESC
+ "4XX" DESC
#+end_src
+#+name: status-codes
+#+caption: HTTP status codes per user agent
+#+RESULTS[4939ff0e53fd1b72a7f39a19a3a50483893b07d5]:
+| Agent | 2XX | 3XX | 4XX | fails 4XX | 5XX |
+|----------------+---------+-----+--------+--------------------+-----|
+| OpenAI-GPTBot | 3017848 | 0 | 554511 | 15.521738400215 | 121 |
+| Everybody else | 99066 | 467 | 34630 | 25.8091923354971 | 14 |
+| ClaudeBot | 1591179 | 26 | 25611 | 1.58360240950446 | 446 |
+| Barkrowler | 272343 | 0 | 1618 | 0.590579921742685 | 7 |
+| Macintosh | 79071 | 2 | 1086 | 1.3548073204506 | 0 |
+| Bytespider | 13609 | 0 | 531 | 3.75477301654646 | 2 |
+| PetalBot | 69223 | 0 | 473 | 0.678651878847009 | 1 |
+| Scrapy | 207240 | 0 | 348 | 0.167492094661911 | 183 |
+| AhrefsBot | 59733 | 0 | 90 | 0.150421179302046 | 9 |
+| Google | 3576 | 0 | 2 | 0.0558971492453885 | 0 |
+| SeekportBot | 2500 | 0 | 0 | 0.0 | 0 |
+
+Let's have a look at the most not found pages. Listed in table [[fail-pages]] are
+each of the page paths, how much bandwidth(=tx=) each consumed, and then the
+request per bots. With one exception, all pages are placeholder from the
+template engine. The repository =hugo-minimalist-theme= is a [[https://gohugo.io][Hugo]] theme. Within
+the curly braces ={{ }}= the rendering engine replaces values. Certainly the
+html parser reads them raw an from the link =a= tag and requests the page.
+=ClaudeBot= seems to track error pages and not query them again. =OpenAI= is
+incapable of doing that, and stubbornly tries over and over.
+
+If you grep for the string /href="{{ .RelPermalink }}"/ over the entire git
+history, you find it appears up to today =954= times. It is surprising how
+=OpenAI= manage to request it =3= times that amount.
-#+begin_src sqlite
-SELECT
- status,
- count(*) as hits,
- round(total (length) / 1024 / 1024, 2) as "tx MB"
- -- round(total (total (length)) OVER (PARTITION BY status / 100) / 1024 / 1024, 2) AS "total tx MB"
-FROM
- logs
-GROUP BY
- "status"
-order by "tx MB" desc
-#+end_src
-
-#+RESULTS[6e8a6986b881fbf55f5e1328d505eb280c73f3b0]:
-| status | hits | tx MB |
-|--------+---------+----------|
-| 200 | 4255831 | 13463.72 |
-| 404 | 422650 | 1617.18 |
-| 400 | 3087 | 3.89 |
-| 500 | 790 | 1.37 |
-| 302 | 202 | 0.5 |
-| 304 | 240 | 0.3 |
-| 206 | 297 | 0.26 |
-| 403 | 29 | 0.11 |
-| 301 | 23 | 0.08 |
-| | 90 | 0.06 |
-| 421 | 7 | 0.03 |
-| 408 | 3 | 0.01 |
-#+begin_src sqlite
+#+begin_src sqlite :exports results
SELECT
- count(*),
- count(distinct ipid),
- count(distinct agentid),
- total (length),
- request_method,
- -- status,
- substr(path, 0, 50)
+ replace(path, '|', '%7C') AS Page,
+ round(total(length) / 1024 / 1024, 2) AS "tx MiB",
+ count(*) FILTER (WHERE agentid = 143) AS "OpenAI",
+ count(*) FILTER (WHERE agentid = 1) AS "ClaudeBot",
+ count(*) FILTER (WHERE agentid NOT IN (1, 143)) AS "Rest"
+ --substr(path, 0, 50)
FROM
logs
WHERE
- path NOT LIKE '/ingrid/%' AND
- status = 200
+ path NOT LIKE '/ingrid/%'
+ AND status = 404
GROUP BY
path
ORDER BY
- 1 DESC
-LIMIT 15
+ 2 DESC
+LIMIT 10
#+end_src
-#+RESULTS[8469c661009196d66d76364873866923b89c8205]:
-| count(*) | count(distinct ipid) | count(distinct agentid) | total (length) | request_method | substr(path, 0, 50) |
-|----------+----------------------+-------------------------+----------------+----------------+---------------------------------|
-| 3540 | 1779 | 425 | 22903047.0 | GET | / |
-| 2171 | 1563 | 48 | 9744532.0 | GET | /robots.txt |
-| 855 | 518 | 50 | 4234481.0 | GET | /favicon.ico |
-| 334 | 262 | 31 | 1304042.0 | GET | /cgit.css |
-| 150 | 116 | 59 | 1030405.0 | GET | /hugo-minimalist-theme/about/ |
-| 144 | 1 | 1 | 854165.0 | GET | /?XDEBUG_SESSION_START=phpstorm |
-| 119 | 69 | 52 | 421470.0 | GET | /cgit.png |
-| 98 | 36 | 14 | 330051.0 | GET | /hugo-minimalist-theme/ |
-| 46 | 31 | 15 | 165446.0 | GET | /homepage/ |
-| 42 | 32 | 13 | 157112.0 | GET | /dotfiles/ |
-| 39 | 34 | 14 | 255204.0 | GET | /hugo-minimalist-theme/log/ |
-| 37 | 30 | 13 | 132913.0 | GET | /hugo-minimalist-theme/tree/ |
-| 37 | 33 | 11 | 103541.0 | GET | /?s=name |
-| 36 | 33 | 12 | 242095.0 | GET | /dotfiles/log/ |
-| 35 | 32 | 14 | 124829.0 | GET | /dotfiles/tree/ |
+#+name: fail-pages
+#+caption: Top 10: =404= error not found pages.
+#+attr_html: :class fail-pages
+#+RESULTS[c7d6ef8ed6eac11d94e7e007c99399963237da56]:
+| Page | tx MiB | OpenAI | ClaudeBot | Rest |
+|--------------------------------------------------------------------------------------+--------+--------+-----------+------|
+| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.RelPermalink%20%7D%7D | 8.36 | 2805 | 3 | 7 |
+| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.URL%20%7D%7D | 5.39 | 1629 | 1 | 13 |
+| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.%20%7D%7D | 4.82 | 1559 | 1 | 4 |
+| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$href%20%7D%7D | 4.28 | 1209 | 4 | 5 |
+| /.env | 3.84 | 0 | 0 | 744 |
+| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.Permalink%20%7D%7D | 3.75 | 1060 | 2 | 15 |
+| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$pag.Next.URL%20%7D%7D | 3.36 | 916 | 1 | 7 |
+| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$pag.Prev.URL%20%7D%7D | 3.34 | 912 | 0 | 7 |
+| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20if%20ne%20.MediaType.SubType | 2.95 | 817 | 1 | 0 |
+| /hugo-minimalist-theme/plain/layouts/taxonomy/%7B%7B%20.Name%20%7C%20urlize%20%7D%7D | 2.86 | 745 | 5 | 0 |
-#+begin_src sqlite
+#+begin_src sqlite :exports results
SELECT
- count(*) as hits,
- sum(length),
- agentid,
- path
+ round(total(length) / 1024 / 1024, 2) AS "tx MiB",
+ count(*),
+ count(distinct agentid) agents,
+ replace(path, '|', '%7C') AS path
--substr(path, 0, 50)
FROM
logs
WHERE
path NOT LIKE '/ingrid/%'
AND status = 404
+ and agentid NOT IN (1, 143)
GROUP BY
path
ORDER BY
- hits desc
-limit 10
+ 1 DESC
+LIMIT 20
#+end_src
-#+RESULTS[3dbf9422fb49ecb2cae63c67615825ef1b1c30a9]:
-| hits | sum(length) | agentid | path | |
-|------+-------------+---------+-------------------------------------------------------------------------------------+--------------------|
-| 2815 | 8768239 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.RelPermalink%20%7D%7D | |
-| 1643 | 5653820 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.URL%20%7D%7D | |
-| 1564 | 5049557 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.%20%7D%7D | |
-| 1218 | 4488059 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$href%20%7D%7D | |
-| 1077 | 3930371 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.Permalink%20%7D%7D | |
-| 924 | 3522501 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$pag.Next.URL%20%7D%7D | |
-| 919 | 3500841 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$pag.Prev.URL%20%7D%7D | |
-| 818 | 3088624 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20if%20ne%20.MediaType.SubType | |
-| 807 | 2530428 | 143 | /hugo-minimalist-theme/plain/layouts/%7B%7B%20.Type%20%7D%7D | |
-| 750 | 3000020 | 143 | /hugo-minimalist-theme/plain/layouts/taxonomy/%7B%7B%20.Name%20 | %20urlize%20%7D%7D |
+#+RESULTS[5d7adfe6034ad15603705e1501ba90fadc0a8f1c]:
+| tx MiB | count(*) | agents | path |
+|--------+----------+--------+----------------------------------------------------------------------------------------------------------|
+| 3.84 | 744 | 368 | /.env |
+| 2.26 | 409 | 1 | /cgi-bin/luci/;stok=/locale |
+| 2.02 | 381 | 182 | /.git/config |
+| 1.08 | 195 | 1 | /actuator/gateway/routes |
+| 0.88 | 173 | 2 | /hello.world?%ADd+allow_url_include%3d1+%ADd+auto_prepend_file%3dphp://input |
+| 0.65 | 119 | 6 | /sitemap.xml |
+| 0.57 | 222 | 12 | /vendor/phpunit/phpunit/src/Util/PHP/eval-stdin.php |
+| 0.55 | 72 | 10 | /dotfiles/tree/config/doom/snippets/org-mode/daily?h=semgrep&id=0b30f0bf0d1697504e05630b387b8e32302c3f7c |
+| 0.54 | 71 | 10 | /dotfiles/tree/config/doom/snippets/org-mode/daily?h=semgrep&id=5b57ec9f30c80a391a1f7ac5c3a69a91658e2c53 |
+| 0.47 | 102 | 10 | /config.json |
+| 0.45 | 119 | 15 | /api/.env |
+| 0.42 | 103 | 55 | /_profiler/phpinfo |
+| 0.39 | 87 | 10 | /.env.prod |
+| 0.37 | 93 | 11 | /.env.save |
+| 0.37 | 92 | 9 | /.env.production |
+| 0.35 | 64 | 3 | /src/.git/config |
+| 0.35 | 63 | 1 | /server/.git/config |
+| 0.35 | 63 | 2 | /media/.git/config |
+| 0.35 | 63 | 3 | /cms/.git/config |
+| 0.35 | 68 | 1 | /95.111.247.99/.env |
+
#+begin_src sqlite :results value file :file fails.csv
SELECT
@@ -387,10 +381,11 @@ GROUP BY
path
ORDER BY
hits desc, path
-limit 550
+limit 50
+
#+end_src
-#+RESULTS[28ba3f6de596e93fd80baf9516e5ae32a8239c72]:
+#+RESULTS[2972512bc0b63468a3eba78fde5efcce703e5d56]:
[[file:fails.csv]]
#+begin_src sqlite
@@ -479,3 +474,90 @@ ORDER BY
#+RESULTS[e3501152f1ec3e7ff822a6713736ff35fb0dcb46]:
[[file:leadfail.csv]]
+#+begin_src sqlite
+select
+ datetime (date, 'unixepoch'),
+ agentid,
+ count(*),
+ path
+from logs
+where path like '/hello.world?%'
+group by agentid
+#+end_src
+
+#+RESULTS[2f837a0274a09a9c00545946c8992d57286667ff]:
+| datetime (date, 'unixepoch') | agentid | count(*) | path |
+|------------------------------+---------+----------+------------------------------------------------------------------------------|
+| 2025-04-12 23:49:39 | 12 | 167 | /hello.world?%ADd+allow_url_include%3d1+%ADd+auto_prepend_file%3dphp://input |
+| 2025-02-08 23:48:46 | 238 | 6 | /hello.world?%ADd+allow_url_include%3d1+%ADd+auto_prepend_file%3dphp://input |
+
+#+begin_src sqlite
+select * from agent where id in (12, 238)
+#+end_src
+
+#+RESULTS[6d8a93a2351c1668ba5a7b975fdbbc6271cace36]:
+| id | user_agent |
+|-----+-----------------------------------------------------------------------|
+| 12 | Custom-AsyncHttpClient |
+| 238 | Mozilla/5.0 (Linux; Linux x86_64; en-US) Gecko/20100101 Firefox/122.0 |
+
+#+begin_src sqlite
+SELECT
+ round(total (length) / 1024, 2) "tx KiB",
+ count(*) hits,
+ agentid,
+ user_agent
+FROM
+ logs
+ JOIN agent ON agentid = id
+ --where agentid in (
+ --select id from agent
+WHERE
+ user_agent LIKE '%google%'
+ --)
+GROUP BY
+ id
+ORDER BY
+ 1 DESC
+#+end_src
+
+
+#+begin_src sqlite
+SELECT
+ count(*),
+ count(distinct ipid),
+ count(distinct agentid),
+ total (length),
+ request_method,
+ -- status,
+ substr(path, 0, 50)
+FROM
+ logs
+WHERE
+ path NOT LIKE '/ingrid/%' AND
+ status = 200
+GROUP BY
+ path
+ORDER BY
+ 1 DESC
+LIMIT 15
+#+end_src
+
+#+RESULTS[8469c661009196d66d76364873866923b89c8205]:
+| count(*) | count(distinct ipid) | count(distinct agentid) | total (length) | request_method | substr(path, 0, 50) |
+|----------+----------------------+-------------------------+----------------+----------------+---------------------------------|
+| 3540 | 1779 | 425 | 22903047.0 | GET | / |
+| 2171 | 1563 | 48 | 9744532.0 | GET | /robots.txt |
+| 855 | 518 | 50 | 4234481.0 | GET | /favicon.ico |
+| 334 | 262 | 31 | 1304042.0 | GET | /cgit.css |
+| 150 | 116 | 59 | 1030405.0 | GET | /hugo-minimalist-theme/about/ |
+| 144 | 1 | 1 | 854165.0 | GET | /?XDEBUG_SESSION_START=phpstorm |
+| 119 | 69 | 52 | 421470.0 | GET | /cgit.png |
+| 98 | 36 | 14 | 330051.0 | GET | /hugo-minimalist-theme/ |
+| 46 | 31 | 15 | 165446.0 | GET | /homepage/ |
+| 42 | 32 | 13 | 157112.0 | GET | /dotfiles/ |
+| 39 | 34 | 14 | 255204.0 | GET | /hugo-minimalist-theme/log/ |
+| 37 | 30 | 13 | 132913.0 | GET | /hugo-minimalist-theme/tree/ |
+| 37 | 33 | 11 | 103541.0 | GET | /?s=name |
+| 36 | 33 | 12 | 242095.0 | GET | /dotfiles/log/ |
+| 35 | 32 | 14 | 124829.0 | GET | /dotfiles/tree/ |