
Commit

check-urls-tub new name
git-svn-id: svn://tug.org/tugboat/trunk@618 7237770a-693a-0410-b4d3-c34f48dbc3f6
kberry committed Feb 9, 2025
1 parent c408745 commit eef3f4a
Showing 4 changed files with 85 additions and 36 deletions.
7 changes: 7 additions & 0 deletions misc/ChangeLog
@@ -1,3 +1,10 @@
2025-02-09 Karl Berry <karl@freefriends.org>

* check-urls-tub: rename from check-pdf-urls-tub
and allow url on command line and --file option.

	* check-balance-tub: new --no-paren-check option to omit the paren check.

2024-11-16 Karl Berry <karl@freefriends.org>

* toctxt-tub: new script for creating the txt file emailed to tex-eds.
14 changes: 12 additions & 2 deletions misc/check-balance-tub
@@ -4,18 +4,28 @@
# Check for balanced parens/braces/brackets/quotes within each paragraph.
# Lots of false positives are likely, but still worth doing.

use strict; use warnings;

exit (&main ());

sub main {
my $paren_check;
if (@ARGV && $ARGV[0] eq "--no-paren-check") {
shift @ARGV; # plain shift inside a sub would operate on @_, not @ARGV
$paren_check = 0;
} else {
$paren_check = 1;
}

$/ = ""; # check a paragraph at a time
while (<>) {
next if /^\\end(article|\{document\})/;
s/\$''\$//; # common inch marks
s/```//; # markdown triple quotes
# remove % comments, a common and ignorable culprit,
# but don't remove \% or "% (printf format strings).
s/(^|[^\\"])%.*$/\1/mg;
&check_balance ('\(', '\)', $_);
s/(^|[^\\"])%.*$/$1/mg;
&check_balance ('\(', '\)', $_) if $paren_check;
&check_balance ('\{', '\}', $_);
&check_balance ('\[', '\]', $_);
&check_balance ("``", "''", $_);
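For reference, a quick usage sketch of the new option (the file name here is
just a placeholder; --no-paren-check disables only the parenthesis check,
while braces, brackets, and quotes are still checked):

  check-balance-tub article.ltx                   # all balance checks
  check-balance-tub --no-paren-check article.ltx  # skip the ( ) check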
34 changes: 0 additions & 34 deletions misc/check-pdf-urls-tub

This file was deleted.

66 changes: 66 additions & 0 deletions misc/check-urls-tub
@@ -0,0 +1,66 @@
#!/bin/bash
# $Id$
# License: do what you want to.
# Originally written by Max Chernoff, 31jul2024.
# Extract urls from a pdf, or as given, and check them. Assume GNU grep
# and bash (for PIPESTATUS).

exit_status=0
mydomain=freefriends.org #`hostname | perl -pe 's/.*\..*\..*$/'`

# Spider the urls given on stdin. wget's exit status is the function's
# status, which we pick up from PIPESTATUS after the pipeline below.
ckurls () {
  wget -i- -nv --no-check-cert --wait=1 --timeout=8 --tries=0 --spider \
    --user-agent="Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0" \
    --referer=https://$mydomain \
    2>&1
}

# Gather the urls to check: from a file (--file F), given directly on the
# command line, or extracted from each pdf argument.
while test $# -gt 0; do
  f=$1; shift
  if test "x$f" = x--file; then
    # take urls from the file named by the next argument
    if test -s "$1"; then
      cat "$1"
      shift
    else
      echo "$0: argument to --file has no urls: $1" >&2
      exit 1 # we are on the left of a pipeline; also caught via PIPESTATUS below
    fi

  # url given on command line
  elif echo "$f" | grep '^http' >/dev/null; then
    echo "$f"

  else # assume it's a pdf
    # We exclude adobe.com since it does not respond to wget requests
    # (but maybe it will with user-agent, etc., so try again).
    #
    # Exclude "." which happens with spelatex (tb142).
    #
    # With wget v1.xx (which is what the tug.org server is using):
    qpdf --qdf "$f" - \
      | grep -aoP '(?<=/URI \()[^)]*(?=\))' \
      | grep -avF example.org \
      | grep -avF xadobe.com \
      | grep -av '^\.' \
      | sort -u
  fi
done | tee /tmp/ckurl.in \
     | ckurls \
     | sed -e 's/^.* URL: //' -e 's/:$//' \
     | tee /tmp/ckurl

# Copy PIPESTATUS right away: stage 0 is the url-gathering loop,
# stage 2 is ckurls (i.e., wget).
pstat=("${PIPESTATUS[@]}")
if test "${pstat[0]}" -ne 0 || test "${pstat[2]}" -ne 0; then
  exit_status=1
fi

exit $exit_status

# output looks like:
# 2024-08-03 08:55:39 URL: https://github.com/borisveytsman/bookshelf 200 OK
# or on error:
# https://www.perplexity.ai/:
# Remote file does not exist -- broken link!!!
# which is why we remove trailing colons in the last sed.

# With wget v2.xx (which is what I have installed locally), try:
# qpdf --qdf /path/to/file.pdf - | grep -aoP '(?<=/URI \()[^)]*(?=\))' | wget --spider -i- | grep ^HTTP

# Basic idea for checking urls in html:
# @links = $html =~ m/<a[^>]+href\s*=\s*["']?([^"'> ]+)/ig;
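For reference, a usage sketch covering the three input modes handled in the
loop above (the file names and urls are placeholders):

  check-urls-tub tb142article.pdf                     # extract /URI links from a pdf
  check-urls-tub https://tug.org/ https://ctan.org/   # urls given directly
  check-urls-tub --file urls.txt                      # urls listed one per line in a file

  # afterwards, list only the problem urls (successful checks end in "200 OK"):
  grep -v ' 200 OK$' /tmp/ckurl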
