145 lines
3.3 KiB
Ruby
Executable File
145 lines
3.3 KiB
Ruby
Executable File
#!/usr/bin/env ruby
|
|
|
|
require "digest"
require "open3"
require "optparse"
require "time"
require "uri"

require "ferrum"
require "nokogiri"
|
|
|
|
# Resolves +url+ against +base_url+ and returns the absolute URL as a String.
#
# The value is returned untouched when it is nil or empty, when it uses a
# scheme that has no meaningful base (data:, javascript:, mailto:, tel:,
# about:), or when it cannot be resolved at all.
#
# @param url [String, nil] the possibly relative URL found in the document
# @param base_url [String] the URL that relative references resolve against
# @return [String, nil] the absolute URL, or +url+ unchanged if it cannot be
#   resolved
def absolutize_url(url, base_url)
  return url if url.nil? || url.empty?
  return url if url.match?(/\A(?:data|javascript|mailto|tel|about):/i)

  URI.join(base_url, url).to_s
rescue URI::Error
  # URI::Error is the parent of both InvalidURIError (unparseable input) and
  # BadURIError (e.g. "both URI are relative" when the base is not absolute).
  # Either way the best effort is to leave the original value in place.
  url
end
|
|
|
|
# Rewrites every image candidate URL in a +srcset+ attribute value so that it
# is absolute, keeping each candidate's descriptor ("2x", "640w", ...).
#
# Candidates are tokenized the way the HTML srcset rules describe rather than
# by splitting on every comma: a URL runs up to the next whitespace, so commas
# inside a URL (data: URIs, CDN transform paths such as "w_100,h_100") stay
# part of that URL. Only a comma that ends a URL token, or one that follows a
# descriptor, separates candidates.
#
# @param srcset [String, nil] raw srcset attribute value
# @param base_url [String] the URL that relative references resolve against
# @return [String] the candidates joined with ", "
def absolutize_srcset(srcset, base_url)
  candidates = []
  rest = srcset.to_s

  loop do
    # Skip separators (whitespace and commas) before the next candidate.
    rest = rest.sub(/\A[\s,]+/, "")
    break if rest.empty?

    url_match = rest.match(/\A\S+/)
    url = url_match[0]
    rest = url_match.post_match

    if url.end_with?(",")
      # Trailing commas terminate the candidate; it carries no descriptor.
      url = url.sub(/,+\z/, "")
      descriptor = ""
    else
      descriptor_match = rest.match(/\A[^,]*/)
      descriptor = descriptor_match[0].strip
      rest = descriptor_match.post_match
    end

    candidate = absolutize_url(url, base_url)
    candidate = "#{candidate} #{descriptor}" unless descriptor.empty?
    candidates << candidate
  end

  candidates.join(", ")
end
|
|
|
|
# Rewrites, in place, every href, src, poster and srcset attribute in +doc+
# so that it holds an absolute URL.
#
# When the document declares a <base href>, relative references are resolved
# against that declared base (itself resolved against +base_url+) instead of
# +base_url+, since that is the URL the page's own links are relative to.
#
# @param doc [#css] parsed document whose nodes support +node[attr]+ reads and
#   writes (a Nokogiri document in this script)
# @param base_url [String] URL the page was loaded from
# @return [Object] the collection returned by the final +css+ lookup; callers
#   in this file ignore it
def absolutize_links!(doc, base_url)
  base_node = doc.css("base[href]").first
  if base_node
    declared_base = absolutize_url(base_node["href"].to_s.strip, base_url)
    # Only adopt the declared base when it resolved to an absolute URL; an
    # empty or unresolvable value would make every later join fail.
    if declared_base.match?(%r{\A[a-z][a-z0-9+.-]*://}i)
      base_url = declared_base
      base_node["href"] = declared_base
    end
  end

  %w[href src poster].each do |attr|
    doc.css("[#{attr}]").each do |node|
      node[attr] = absolutize_url(node[attr].to_s.strip, base_url)
    end
  end

  doc.css("[srcset]").each do |node|
    node["srcset"] = absolutize_srcset(node["srcset"].to_s.strip, base_url)
  end
end
|
|
|
|
# Renders +html+ to plain text by piping it through the external `w3m`
# program in dump mode, with link numbering turned on.
#
# Open3.capture2 feeds stdin and drains stdout concurrently, so the call
# cannot stall on a full pipe, and it exposes the exit status so a failed
# conversion is reported instead of being archived as empty or partial text.
#
# @param html [String] HTML document to render
# @return [String] the text produced by w3m
# @raise [Errno::ENOENT] if the w3m executable cannot be found
# @raise [RuntimeError] if w3m exits unsuccessfully
def text_from_html(html)
  output, status = Open3.capture2(
    "w3m", "-dump", "-T", "text/html", "-o", "display_link_number=1",
    stdin_data: html
  )
  raise "w3m exited with status #{status.exitstatus.inspect}" unless status.success?

  output
end
|
|
|
|
# Command-line options and their defaults.
#   :manual       - run a visible browser and wait for the operator
#   :browser_path - explicit browser binary, or nil to let Ferrum choose
options = {
  manual: false,
  browser_path: nil
}

option_parser = OptionParser.new do |parser|
  parser.banner = "Usage: bin/archive [--manual] [--browser-path PATH] URL [URL ...]"

  parser.on("--manual", "Open a visible browser window so you can complete anti-bot challenges") do
    options[:manual] = true
  end

  parser.on("--browser-path PATH", "Path to the browser binary to launch") do |path|
    options[:browser_path] = path
  end
end

begin
  # parse! removes recognised switches from ARGV, leaving only the URLs.
  option_parser.parse!
rescue OptionParser::ParseError => e
  # Unknown or malformed switches: show a usage message, not a backtrace.
  warn e.message
  warn option_parser.help
  exit 1
end
|
|
|
|
# Main script: archive each URL left on the command line as a plain-text file
# under static/archive/, print one YAML reference entry per URL, and copy all
# entries to the clipboard with pbcopy.

urls = ARGV
clipboard = +""

if urls.empty?
  warn "Please supply one or more URLs"
  exit 1
end

puts "references:"

# In manual mode, default to Brave when it exists at this macOS path and no
# browser was chosen explicitly.
if options[:manual] && options[:browser_path].nil?
  brave_path = "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"
  options[:browser_path] = brave_path if File.exist?(brave_path)
end

browser = Ferrum::Browser.new(
  headless: !options[:manual],
  timeout: 30,
  process_timeout: options[:manual] ? 60 : 30,
  browser_path: options[:browser_path],
  browser_options: { "no-sandbox": nil }
)

begin
  # Created inside the begin so the ensure below also shuts the browser down
  # when page creation itself fails.
  page = browser.create_page

  urls.each do |url|
    begin
      page.goto(url)

      if options[:manual]
        warn "Manual mode: finish any challenge in the browser window for #{url}"
        warn "Press Enter here once the page is fully loaded."
        $stdin.gets
        warn "Capturing page..."
      else
        page.network.wait_for_idle(timeout: 10)
      end

      html = page.body.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
      doc = Nokogiri::HTML(html)
      absolutize_links!(doc, url)

      title = doc.at("title")&.text&.strip
      raise "No title found" if title.to_s.empty?

      text_content = text_from_html(doc.to_html)

      # Short content fingerprint: first six alphanumerics of the base64 MD5
      # of URL + text, lower-cased, to keep file names unique per capture.
      hash = Digest::MD5.base64digest(url + text_content)
                        .scan(/[a-z0-9]/i)
                        .first(6)
                        .join
                        .downcase

      filename = "#{URI.parse(url).host.tr(".", "-")}-#{hash}.txt"

      File.write("static/archive/#{filename}", text_content)
    rescue => ex
      # Any failure for a URL (navigation, parsing, conversion, naming or
      # writing the file) aborts the run; the ensure still quits the browser.
      warn "Archive error (#{ex}; #{url})"
      exit 1
    end

    # The title goes into a double-quoted YAML scalar: collapse whitespace
    # (including newlines) and escape backslashes and double quotes so the
    # emitted entry stays valid YAML.
    yaml_title = title.gsub(/\s+/, " ").gsub(/["\\]/) { |char| "\\#{char}" }

    yaml = <<~STR
      - title: "#{yaml_title}"
        url: #{url}
        date: #{Time.now.utc.iso8601}
        file: #{filename}
    STR

    puts yaml

    clipboard << yaml
  end
ensure
  browser.quit
end

begin
  IO.popen("pbcopy", "w") { |pb| pb.write(clipboard) }
rescue Errno::ENOENT
  # The entries were already printed to stdout, so a missing pbcopy only
  # costs the clipboard convenience.
  warn "pbcopy not found; skipping clipboard copy"
end
|