diff options
-rw-r--r-- | webstats/plots.js | 53 | ||||
-rw-r--r-- | webstats/workforrobots.org | 356 |
2 files changed, 234 insertions, 175 deletions
diff --git a/webstats/plots.js b/webstats/plots.js index d861450..94458d9 100644 --- a/webstats/plots.js +++ b/webstats/plots.js @@ -266,6 +266,16 @@ function agentChart(header, series, container) { let uplot = new uPlot(opts, series, container); container["uobj"] = uplot; } + +function decodeURIOrPass(str) { + try { + return decodeURI(str); + } catch (e) { + console.error("error with", str); + return str; + } +} + addEventListener("load", () => { fetch("/top_agent_traffic.csv") .then(responseParseCSV) @@ -276,42 +286,9 @@ addEventListener("load", () => { agentChart(headers, series, cont); }); - fetch("/fails.csv") - .then(responseParseCSV) - .then((data) => { - let headers = data[0]; - return data - .slice(1) - .map((row) => [ - "tr", - ["td.org-right", scaling(parseFloat(row[0]))], - ["td.org-right", scaling(parseFloat(row[1])) + "B"], - ["td.org-left", decodeURI(row[2])], - ]); - }) - .then((series) => { - let table = [ - "table", - { - border: 2, - cellPadding: 6, - cellSpacing: 0, - rules: "groups", - frame: "hsides", - }, - [ - "thead", - [ - "tr", - ["th.org-right", "hits"], - ["th.org-right", "tx"], - ["th.org-left", "path"], - ], - ], - ]; - let tbody = ["tbody"].concat(series); - table.push(tbody); - console.log(table); - document.body.appendChild(domEl(table)); - }); + document.querySelectorAll("table.fail-pages tr").forEach((tr, idx) => { + if (idx !== 0) { + tr.children[0].textContent = decodeURIOrPass(tr.children[0].textContent); + } + }); }); diff --git a/webstats/workforrobots.org b/webstats/workforrobots.org index 76e7ddd..8cfeafe 100644 --- a/webstats/workforrobots.org +++ b/webstats/workforrobots.org @@ -167,7 +167,7 @@ downloads. There are other people besides them that downloaded, but in total, in the same time they only downloaded =≈21MiB=. That is =0.3%= compared to =ClaudeBot=. -#+begin_src sqlite +#+begin_src sqlite :exports results SELECT --agent.id, user_agent as "User Agent", @@ -203,174 +203,168 @@ order by total(length) desc * What are they looking at? The web front end of git repositories of course, but it there a pattern? -#+begin_src sqlite -#+end_src - -#+RESULTS[5c52f5ea84c4e903cbbbf8dd8f73139532de16dc]: +Table [[status-codes]] show the status codes of all requests performed by the users. +The failure rate of =OpenAI= is alarming. It from its =3.5 million= requests +=15%= are client errors, mostly not found pages of error =404=. What is their +scraper doing so wrong? =ClaudeBot= as noted earlier, manages to scrape with +half the requests and a failure rate of =1.6%=. +=Everybody else= are the =Rest=, all users that can't be grouped into the major +users. Yet they have a failure rate of =25%=, but that is normal as they are +mostly hacker robots scanning for vulnerabilities. -#+begin_src sqlite -select count(*), total(length), status -from logs -where agentid = 4602 -group by status -#+end_src - -#+RESULTS[91700bd4c58675dde008a24f8e07cfcea3966f58]: -| count(*) | total(length) | status | -|----------+---------------+--------| -| 207240 | 497513096.0 | 200 | -| 348 | 1195624.0 | 404 | -| 183 | 186683.0 | 500 | -#+begin_src sqlite -select count(distinct ipid) -, count(distinct agentid) -from logs -where -agentid not in (143, 1, 19, 6, 4602, 3, 2, 4, 10306, 9) -#+end_src - -#+RESULTS[efe7c2c9c481b475baccee8d9ed48d814003b23e]: -| count(distinct ipid) | count(distinct agentid) | -|----------------------+-------------------------| -| 35144 | 20648 | - -#+begin_src sqlite -select count(distinct ipid), -max(date), - max(datetime (date, 'unixepoch')) -from logs -where agentid = 4602 -#+end_src - -#+RESULTS[49b56e0faa552ea7df0d2515250f8f16365a7a40]: -| count(distinct ipid) | max(date) | max(datetime (date, 'unixepoch')) | -|----------------------+------------+-----------------------------------| -| 7 | 1742216884 | 2025-03-17 13:08:04 | - -#+begin_src sqlite +#+begin_src sqlite :exports results SELECT - round(total (length) / 1024, 2) "tx KiB", - count(*) hits, - agentid, - user_agent + CASE WHEN agentid = 143 THEN + 'OpenAI-GPTBot' + WHEN agentid = 1 THEN + 'ClaudeBot' + WHEN agentid = 19 THEN + 'Barkrowler' + WHEN agentid = 6 THEN + 'Macintosh' + WHEN agentid = 4602 THEN + 'Scrapy' + WHEN agentid = 3 THEN + 'PetalBot' + WHEN agentid = 2 THEN + 'AhrefsBot' + WHEN agentid = 4 THEN + 'Bytespider' + WHEN agentid = 10306 THEN + 'SeekportBot' + WHEN agentid = 9 THEN + 'Google' + ELSE + 'Everybody else' + END AS Agent, + --count(*) AS "Requests", + count(*) FILTER (WHERE status BETWEEN 200 AND 299) AS "2XX", + count(*) FILTER (WHERE status BETWEEN 300 AND 399) AS "3XX", + count(*) FILTER (WHERE status BETWEEN 400 AND 499) AS "4XX", + --(100.0 * count(*) FILTER (WHERE status BETWEEN 400 AND 499)) / count(*) AS "fails 4XX", + count(*) FILTER (WHERE status BETWEEN 500 AND 599) AS "5XX" FROM logs - JOIN agent ON agentid = id - --where agentid in ( - --select id from agent -WHERE - user_agent LIKE '%google%' - --) GROUP BY - id + Agent ORDER BY - 1 DESC + "4XX" DESC #+end_src +#+name: status-codes +#+caption: HTTP status codes per user agent +#+RESULTS[4939ff0e53fd1b72a7f39a19a3a50483893b07d5]: +| Agent | 2XX | 3XX | 4XX | fails 4XX | 5XX | +|----------------+---------+-----+--------+--------------------+-----| +| OpenAI-GPTBot | 3017848 | 0 | 554511 | 15.521738400215 | 121 | +| Everybody else | 99066 | 467 | 34630 | 25.8091923354971 | 14 | +| ClaudeBot | 1591179 | 26 | 25611 | 1.58360240950446 | 446 | +| Barkrowler | 272343 | 0 | 1618 | 0.590579921742685 | 7 | +| Macintosh | 79071 | 2 | 1086 | 1.3548073204506 | 0 | +| Bytespider | 13609 | 0 | 531 | 3.75477301654646 | 2 | +| PetalBot | 69223 | 0 | 473 | 0.678651878847009 | 1 | +| Scrapy | 207240 | 0 | 348 | 0.167492094661911 | 183 | +| AhrefsBot | 59733 | 0 | 90 | 0.150421179302046 | 9 | +| Google | 3576 | 0 | 2 | 0.0558971492453885 | 0 | +| SeekportBot | 2500 | 0 | 0 | 0.0 | 0 | + +Let's have a look at the most not found pages. Listed in table [[fail-pages]] are +each of the page paths, how much bandwidth(=tx=) each consumed, and then the +request per bots. With one exception, all pages are placeholder from the +template engine. The repository =hugo-minimalist-theme= is a [[https://gohugo.io][Hugo]] theme. Within +the curly braces ={{ }}= the rendering engine replaces values. Certainly the +html parser reads them raw an from the link =a= tag and requests the page. +=ClaudeBot= seems to track error pages and not query them again. =OpenAI= is +incapable of doing that, and stubbornly tries over and over. + +If you grep for the string /href="{{ .RelPermalink }}"/ over the entire git +history, you find it appears up to today =954= times. It is surprising how +=OpenAI= manage to request it =3= times that amount. -#+begin_src sqlite -SELECT - status, - count(*) as hits, - round(total (length) / 1024 / 1024, 2) as "tx MB" - -- round(total (total (length)) OVER (PARTITION BY status / 100) / 1024 / 1024, 2) AS "total tx MB" -FROM - logs -GROUP BY - "status" -order by "tx MB" desc -#+end_src - -#+RESULTS[6e8a6986b881fbf55f5e1328d505eb280c73f3b0]: -| status | hits | tx MB | -|--------+---------+----------| -| 200 | 4255831 | 13463.72 | -| 404 | 422650 | 1617.18 | -| 400 | 3087 | 3.89 | -| 500 | 790 | 1.37 | -| 302 | 202 | 0.5 | -| 304 | 240 | 0.3 | -| 206 | 297 | 0.26 | -| 403 | 29 | 0.11 | -| 301 | 23 | 0.08 | -| | 90 | 0.06 | -| 421 | 7 | 0.03 | -| 408 | 3 | 0.01 | -#+begin_src sqlite +#+begin_src sqlite :exports results SELECT - count(*), - count(distinct ipid), - count(distinct agentid), - total (length), - request_method, - -- status, - substr(path, 0, 50) + replace(path, '|', '%7C') AS Page, + round(total(length) / 1024 / 1024, 2) AS "tx MiB", + count(*) FILTER (WHERE agentid = 143) AS "OpenAI", + count(*) FILTER (WHERE agentid = 1) AS "ClaudeBot", + count(*) FILTER (WHERE agentid NOT IN (1, 143)) AS "Rest" + --substr(path, 0, 50) FROM logs WHERE - path NOT LIKE '/ingrid/%' AND - status = 200 + path NOT LIKE '/ingrid/%' + AND status = 404 GROUP BY path ORDER BY - 1 DESC -LIMIT 15 + 2 DESC +LIMIT 10 #+end_src -#+RESULTS[8469c661009196d66d76364873866923b89c8205]: -| count(*) | count(distinct ipid) | count(distinct agentid) | total (length) | request_method | substr(path, 0, 50) | -|----------+----------------------+-------------------------+----------------+----------------+---------------------------------| -| 3540 | 1779 | 425 | 22903047.0 | GET | / | -| 2171 | 1563 | 48 | 9744532.0 | GET | /robots.txt | -| 855 | 518 | 50 | 4234481.0 | GET | /favicon.ico | -| 334 | 262 | 31 | 1304042.0 | GET | /cgit.css | -| 150 | 116 | 59 | 1030405.0 | GET | /hugo-minimalist-theme/about/ | -| 144 | 1 | 1 | 854165.0 | GET | /?XDEBUG_SESSION_START=phpstorm | -| 119 | 69 | 52 | 421470.0 | GET | /cgit.png | -| 98 | 36 | 14 | 330051.0 | GET | /hugo-minimalist-theme/ | -| 46 | 31 | 15 | 165446.0 | GET | /homepage/ | -| 42 | 32 | 13 | 157112.0 | GET | /dotfiles/ | -| 39 | 34 | 14 | 255204.0 | GET | /hugo-minimalist-theme/log/ | -| 37 | 30 | 13 | 132913.0 | GET | /hugo-minimalist-theme/tree/ | -| 37 | 33 | 11 | 103541.0 | GET | /?s=name | -| 36 | 33 | 12 | 242095.0 | GET | /dotfiles/log/ | -| 35 | 32 | 14 | 124829.0 | GET | /dotfiles/tree/ | +#+name: fail-pages +#+caption: Top 10: =404= error not found pages. +#+attr_html: :class fail-pages +#+RESULTS[c7d6ef8ed6eac11d94e7e007c99399963237da56]: +| Page | tx MiB | OpenAI | ClaudeBot | Rest | +|--------------------------------------------------------------------------------------+--------+--------+-----------+------| +| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.RelPermalink%20%7D%7D | 8.36 | 2805 | 3 | 7 | +| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.URL%20%7D%7D | 5.39 | 1629 | 1 | 13 | +| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.%20%7D%7D | 4.82 | 1559 | 1 | 4 | +| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$href%20%7D%7D | 4.28 | 1209 | 4 | 5 | +| /.env | 3.84 | 0 | 0 | 744 | +| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.Permalink%20%7D%7D | 3.75 | 1060 | 2 | 15 | +| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$pag.Next.URL%20%7D%7D | 3.36 | 916 | 1 | 7 | +| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$pag.Prev.URL%20%7D%7D | 3.34 | 912 | 0 | 7 | +| /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20if%20ne%20.MediaType.SubType | 2.95 | 817 | 1 | 0 | +| /hugo-minimalist-theme/plain/layouts/taxonomy/%7B%7B%20.Name%20%7C%20urlize%20%7D%7D | 2.86 | 745 | 5 | 0 | -#+begin_src sqlite +#+begin_src sqlite :exports results SELECT - count(*) as hits, - sum(length), - agentid, - path + round(total(length) / 1024 / 1024, 2) AS "tx MiB", + count(*), + count(distinct agentid) agents, + replace(path, '|', '%7C') AS path --substr(path, 0, 50) FROM logs WHERE path NOT LIKE '/ingrid/%' AND status = 404 + and agentid NOT IN (1, 143) GROUP BY path ORDER BY - hits desc -limit 10 + 1 DESC +LIMIT 20 #+end_src -#+RESULTS[3dbf9422fb49ecb2cae63c67615825ef1b1c30a9]: -| hits | sum(length) | agentid | path | | -|------+-------------+---------+-------------------------------------------------------------------------------------+--------------------| -| 2815 | 8768239 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.RelPermalink%20%7D%7D | | -| 1643 | 5653820 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.URL%20%7D%7D | | -| 1564 | 5049557 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.%20%7D%7D | | -| 1218 | 4488059 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$href%20%7D%7D | | -| 1077 | 3930371 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20.Permalink%20%7D%7D | | -| 924 | 3522501 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$pag.Next.URL%20%7D%7D | | -| 919 | 3500841 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20$pag.Prev.URL%20%7D%7D | | -| 818 | 3088624 | 143 | /hugo-minimalist-theme/plain/layouts/partials/%7B%7B%20if%20ne%20.MediaType.SubType | | -| 807 | 2530428 | 143 | /hugo-minimalist-theme/plain/layouts/%7B%7B%20.Type%20%7D%7D | | -| 750 | 3000020 | 143 | /hugo-minimalist-theme/plain/layouts/taxonomy/%7B%7B%20.Name%20 | %20urlize%20%7D%7D | +#+RESULTS[5d7adfe6034ad15603705e1501ba90fadc0a8f1c]: +| tx MiB | count(*) | agents | path | +|--------+----------+--------+----------------------------------------------------------------------------------------------------------| +| 3.84 | 744 | 368 | /.env | +| 2.26 | 409 | 1 | /cgi-bin/luci/;stok=/locale | +| 2.02 | 381 | 182 | /.git/config | +| 1.08 | 195 | 1 | /actuator/gateway/routes | +| 0.88 | 173 | 2 | /hello.world?%ADd+allow_url_include%3d1+%ADd+auto_prepend_file%3dphp://input | +| 0.65 | 119 | 6 | /sitemap.xml | +| 0.57 | 222 | 12 | /vendor/phpunit/phpunit/src/Util/PHP/eval-stdin.php | +| 0.55 | 72 | 10 | /dotfiles/tree/config/doom/snippets/org-mode/daily?h=semgrep&id=0b30f0bf0d1697504e05630b387b8e32302c3f7c | +| 0.54 | 71 | 10 | /dotfiles/tree/config/doom/snippets/org-mode/daily?h=semgrep&id=5b57ec9f30c80a391a1f7ac5c3a69a91658e2c53 | +| 0.47 | 102 | 10 | /config.json | +| 0.45 | 119 | 15 | /api/.env | +| 0.42 | 103 | 55 | /_profiler/phpinfo | +| 0.39 | 87 | 10 | /.env.prod | +| 0.37 | 93 | 11 | /.env.save | +| 0.37 | 92 | 9 | /.env.production | +| 0.35 | 64 | 3 | /src/.git/config | +| 0.35 | 63 | 1 | /server/.git/config | +| 0.35 | 63 | 2 | /media/.git/config | +| 0.35 | 63 | 3 | /cms/.git/config | +| 0.35 | 68 | 1 | /95.111.247.99/.env | + #+begin_src sqlite :results value file :file fails.csv SELECT @@ -387,10 +381,11 @@ GROUP BY path ORDER BY hits desc, path -limit 550 +limit 50 + #+end_src -#+RESULTS[28ba3f6de596e93fd80baf9516e5ae32a8239c72]: +#+RESULTS[2972512bc0b63468a3eba78fde5efcce703e5d56]: [[file:fails.csv]] #+begin_src sqlite @@ -479,3 +474,90 @@ ORDER BY #+RESULTS[e3501152f1ec3e7ff822a6713736ff35fb0dcb46]: [[file:leadfail.csv]] +#+begin_src sqlite +select + datetime (date, 'unixepoch'), + agentid, + count(*), + path +from logs +where path like '/hello.world?%' +group by agentid +#+end_src + +#+RESULTS[2f837a0274a09a9c00545946c8992d57286667ff]: +| datetime (date, 'unixepoch') | agentid | count(*) | path | +|------------------------------+---------+----------+------------------------------------------------------------------------------| +| 2025-04-12 23:49:39 | 12 | 167 | /hello.world?%ADd+allow_url_include%3d1+%ADd+auto_prepend_file%3dphp://input | +| 2025-02-08 23:48:46 | 238 | 6 | /hello.world?%ADd+allow_url_include%3d1+%ADd+auto_prepend_file%3dphp://input | + +#+begin_src sqlite +select * from agent where id in (12, 238) +#+end_src + +#+RESULTS[6d8a93a2351c1668ba5a7b975fdbbc6271cace36]: +| id | user_agent | +|-----+-----------------------------------------------------------------------| +| 12 | Custom-AsyncHttpClient | +| 238 | Mozilla/5.0 (Linux; Linux x86_64; en-US) Gecko/20100101 Firefox/122.0 | + +#+begin_src sqlite +SELECT + round(total (length) / 1024, 2) "tx KiB", + count(*) hits, + agentid, + user_agent +FROM + logs + JOIN agent ON agentid = id + --where agentid in ( + --select id from agent +WHERE + user_agent LIKE '%google%' + --) +GROUP BY + id +ORDER BY + 1 DESC +#+end_src + + +#+begin_src sqlite +SELECT + count(*), + count(distinct ipid), + count(distinct agentid), + total (length), + request_method, + -- status, + substr(path, 0, 50) +FROM + logs +WHERE + path NOT LIKE '/ingrid/%' AND + status = 200 +GROUP BY + path +ORDER BY + 1 DESC +LIMIT 15 +#+end_src + +#+RESULTS[8469c661009196d66d76364873866923b89c8205]: +| count(*) | count(distinct ipid) | count(distinct agentid) | total (length) | request_method | substr(path, 0, 50) | +|----------+----------------------+-------------------------+----------------+----------------+---------------------------------| +| 3540 | 1779 | 425 | 22903047.0 | GET | / | +| 2171 | 1563 | 48 | 9744532.0 | GET | /robots.txt | +| 855 | 518 | 50 | 4234481.0 | GET | /favicon.ico | +| 334 | 262 | 31 | 1304042.0 | GET | /cgit.css | +| 150 | 116 | 59 | 1030405.0 | GET | /hugo-minimalist-theme/about/ | +| 144 | 1 | 1 | 854165.0 | GET | /?XDEBUG_SESSION_START=phpstorm | +| 119 | 69 | 52 | 421470.0 | GET | /cgit.png | +| 98 | 36 | 14 | 330051.0 | GET | /hugo-minimalist-theme/ | +| 46 | 31 | 15 | 165446.0 | GET | /homepage/ | +| 42 | 32 | 13 | 157112.0 | GET | /dotfiles/ | +| 39 | 34 | 14 | 255204.0 | GET | /hugo-minimalist-theme/log/ | +| 37 | 30 | 13 | 132913.0 | GET | /hugo-minimalist-theme/tree/ | +| 37 | 33 | 11 | 103541.0 | GET | /?s=name | +| 36 | 33 | 12 | 242095.0 | GET | /dotfiles/log/ | +| 35 | 32 | 14 | 124829.0 | GET | /dotfiles/tree/ | |