Files
davideisinger.com/bin/archive
2026-04-08 01:24:16 -04:00

145 lines
3.3 KiB
Ruby
Executable File

#!/usr/bin/env ruby
require "uri"
require "digest"
require "time"
require "optparse"
require "nokogiri"
require "ferrum"
# Resolve +url+ against +base_url+ into an absolute URL string.
#
# Blank inputs and non-fetchable schemes (data:, javascript:, mailto:,
# tel:, about:) are returned untouched, as is anything URI cannot parse.
def absolutize_url(url, base_url)
  if url.nil? || url.empty? || url.match?(/\A(?:data|javascript|mailto|tel|about):/i)
    return url
  end

  begin
    URI.join(base_url, url).to_s
  rescue URI::InvalidURIError
    # Pass unparseable values through rather than dropping them.
    url
  end
end
# Rewrite every "URL [descriptor]" candidate in a srcset attribute so the
# URL is absolute, preserving any width/density descriptor after it.
def absolutize_srcset(srcset, base_url)
  srcset.split(",").filter_map do |candidate|
    url_part, descriptor = candidate.strip.split(/\s+/, 2)
    next if url_part.nil? # empty candidate (e.g. trailing comma)

    absolute = absolutize_url(url_part, base_url)
    descriptor ? "#{absolute} #{descriptor}" : absolute
  end.join(", ")
end
# Mutate +doc+ (a Nokogiri document) so all URL-bearing attributes are
# absolute. Plain single-URL attributes share one code path; srcset needs
# per-candidate handling, so it gets its own pass.
def absolutize_links!(doc, base_url)
  %w[href src poster].each do |name|
    doc.css("[#{name}]").each do |element|
      element[name] = absolutize_url(element[name].to_s.strip, base_url)
    end
  end

  doc.css("[srcset]").each do |element|
    element["srcset"] = absolutize_srcset(element["srcset"].to_s.strip, base_url)
  end
end
# Render HTML to readable plain text by piping it through w3m, with link
# targets dumped as numbered references. Requires the w3m binary on PATH.
def text_from_html(html)
  w3m_cmd = ["w3m", "-dump", "-T", "text/html", "-o", "display_link_number=1"]
  IO.popen(w3m_cmd, "r+") do |pipe|
    pipe.write(html)
    pipe.close_write # signal EOF so w3m starts rendering
    pipe.read
  end
end
# CLI flags: --manual opens a visible browser (for anti-bot challenges),
# --browser-path overrides which browser binary Ferrum launches.
options = { manual: false, browser_path: nil }

cli = OptionParser.new do |parser|
  parser.banner = "Usage: bin/archive [--manual] [--browser-path PATH] URL [URL ...]"

  parser.on("--manual", "Open a visible browser window so you can complete anti-bot challenges") do
    options[:manual] = true
  end

  parser.on("--browser-path PATH", "Path to the browser binary to launch") do |path|
    options[:browser_path] = path
  end
end
cli.parse!
# Remaining arguments are the URLs to archive; clipboard accumulates the
# YAML emitted for each one so it can be pasted into front matter later.
urls = ARGV
clipboard = ""

if urls.empty?
  warn "Please supply one or more URLs"
  exit 1
end

puts "references:"

# In manual mode with no explicit browser, default to Brave if installed
# (its window is what the user interacts with to clear challenges).
if options[:manual] && options[:browser_path].nil?
  brave = "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"
  options[:browser_path] = brave if File.exist?(brave)
end
# Launch the browser: headless unless the user asked for a visible window.
# Give the process longer to come up in manual mode, since a human-driven
# session tolerates (and may need) the extra startup slack.
launch_timeout = options[:manual] ? 60 : 30
browser = Ferrum::Browser.new(
  headless: !options[:manual],
  timeout: 30,
  process_timeout: launch_timeout,
  browser_path: options[:browser_path],
  browser_options: { "no-sandbox": nil }
)
page = browser.create_page
# Archive each URL: load it, absolutize its links, dump it to text via w3m,
# write the text under static/archive/, and print + collect a YAML reference
# entry. The browser is always shut down, even on failure.
begin
  urls.each do |url|
    begin
      page.goto(url)
      if options[:manual]
        warn "Manual mode: finish any challenge in the browser window for #{url}"
        warn "Press Enter here once the page is fully loaded."
        $stdin.gets
        warn "Capturing page..."
      else
        page.network.wait_for_idle(timeout: 10)
      end
      # Scrub invalid/unencodable bytes so Nokogiri and w3m get clean UTF-8.
      html = page.body.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
      doc = Nokogiri::HTML(html)
      absolutize_links!(doc, url)
      title = doc.at("title")&.text&.strip
      # A missing title usually means a challenge/error page was captured.
      raise "No title found" if title.to_s.empty?
      text_content = text_from_html(doc.to_html)
    rescue => ex
      warn "Archive error (#{ex}; #{url})"
      exit 1
    end

    # Short stable slug: first six alphanumeric characters of the MD5
    # (base64) of url + rendered text, lowercased.
    hash = Digest::MD5.base64digest(url + text_content)
      .scan(/[a-z0-9]/i)
      .first(6)
      .join
      .downcase
    filename = "#{URI.parse(url).host.gsub(".", "-")}-#{hash}.txt"
    # FIX: restore the mangled "#(unknown)" interpolations — both the write
    # path and the YAML `file:` field must reference the computed filename.
    File.write("static/archive/#{filename}", text_content)
    yaml = <<~STR
      - title: "#{title}"
        url: #{url}
        date: #{Time.now.utc.iso8601}
        file: #{filename}
    STR
    puts yaml
    clipboard += yaml
  end
ensure
  browser.quit
end
# Copy all emitted YAML to the macOS clipboard for pasting into front matter.
IO.popen("pbcopy", "w") { |pb| pb.write(clipboard) }