112 lines
2.3 KiB
Ruby
Executable File
112 lines
2.3 KiB
Ruby
Executable File
#!/usr/bin/env ruby
|
|
|
|
require "uri"
|
|
require "digest"
|
|
require "time"
|
|
require "nokogiri"
|
|
require "ferrum"
|
|
|
|
# Resolves a possibly-relative URL against a base URL.
#
# nil, empty strings, and URLs using a scheme that is not a fetchable
# location (data:, javascript:, mailto:, tel:, about:) are returned untouched.
# Resolution is best-effort: if the URI library cannot parse or join the
# inputs, the original +url+ is returned rather than raising.
#
# @param url [String, nil] the URL to resolve
# @param base_url [String] the URL that relative references are resolved against
# @return [String, nil] the absolute URL, or +url+ unchanged when it cannot
#   or should not be resolved
def absolutize_url(url, base_url)
  return url if url.nil? || url.empty?
  return url if url.match?(/\A(?:data|javascript|mailto|tel|about):/i)

  URI.join(base_url, url).to_s
rescue URI::Error
  # URI::Error is the parent of InvalidURIError (unparseable input) and
  # BadURIError (e.g. a relative base), so every join failure falls back
  # to the unmodified URL.
  url
end
|
|
|
|
# Rewrites every image candidate in a +srcset+ attribute value so that its
# URL is absolute.
#
# Candidates are tokenized the way the HTML srcset grammar describes them:
# the URL is a run of non-whitespace characters, optionally followed by a
# descriptor (e.g. "2x", "480w") that runs up to the next comma. A comma is
# therefore only a separator when it trails a URL or ends a descriptor, which
# keeps URLs that contain commas (data: URIs, CDN transform paths) intact.
#
# @param srcset [String] raw srcset attribute value
# @param base_url [String] the URL that relative references are resolved against
# @return [String] the candidates joined with ", ", each as "url" or
#   "url descriptor"; an empty string when there are no candidates
def absolutize_srcset(srcset, base_url)
  remaining = srcset.to_s.dup
  candidates = []

  loop do
    # Skip separators and padding left over from the previous candidate.
    remaining.slice!(/\A[\s,]+/)
    url = remaining.slice!(/\A\S+/)
    break unless url

    if url.end_with?(",")
      # A comma glued to the end of the URL terminates the candidate,
      # so there is no descriptor to read.
      url = url.sub(/,+\z/, "")
      descriptor = nil
    else
      descriptor = remaining.slice!(/\A[^,]*/).strip
      descriptor = nil if descriptor.empty?
    end

    candidates << [absolutize_url(url, base_url), descriptor].compact.join(" ")
  end

  candidates.join(", ")
end
|
|
|
|
# Makes the URL-bearing attributes of a parsed document absolute, modifying
# the nodes in place.
#
# Every node carrying an +href+, +src+ or +poster+ attribute has that
# attribute's whitespace-trimmed value passed through absolutize_url; every
# node carrying +srcset+ has its trimmed value passed through
# absolutize_srcset.
#
# @param doc [#css] document whose +css+ method yields nodes supporting
#   +[]+ and +[]=+ for attribute access
# @param base_url [String] the URL that relative references are resolved against
def absolutize_links!(doc, base_url)
  single_url_attributes = %w[href src poster]

  single_url_attributes.each do |name|
    doc.css("[#{name}]").each do |element|
      raw_value = element[name].to_s.strip
      element[name] = absolutize_url(raw_value, base_url)
    end
  end

  doc.css("[srcset]").each do |element|
    raw_value = element["srcset"].to_s.strip
    element["srcset"] = absolutize_srcset(raw_value, base_url)
  end
end
|
|
|
|
# Renders an HTML string to plain text by piping it through the external
# +w3m+ program in dump mode with link numbering enabled.
#
# @param html [String] the HTML document to render
# @return [String] everything w3m wrote to standard output
# @raise [RuntimeError] if w3m terminates unsuccessfully, so a failed
#   conversion is not mistaken for an empty page
# @raise [SystemCallError] if the w3m process cannot be started
def text_from_html(html)
  command = ["w3m", "-dump", "-T", "text/html", "-o", "display_link_number=1"]

  text = IO.popen(command, "r+") do |io|
    io.write(html)
    # Closing the write side signals end of input so w3m can finish.
    io.close_write
    io.read
  end

  # The block form of IO.popen waits for the child and records its status.
  status = Process.last_status
  raise "w3m failed (#{status})" unless status&.success?

  text
end
|
|
|
|
# Script entry point.
#
# For each URL given on the command line: load it in a headless browser,
# make its links absolute, render it to text, write the text under
# static/archive/, and print a YAML entry describing the archived copy.
# All printed entries are also copied to the clipboard via pbcopy.
urls = ARGV.dup
clipboard = +""

if urls.empty?
  warn "Please supply one or more URLs"
  exit 1
end

puts "references:"

browser = Ferrum::Browser.new(
  timeout: 30,
  process_timeout: 30,
  browser_options: { "no-sandbox": nil }
)

begin
  # Created inside the begin block so the browser is still shut down by the
  # ensure clause if page creation fails.
  page = browser.create_page

  urls.each do |url|
    begin
      page.goto(url)
      page.network.wait_for_idle(timeout: 10)

      html = page.body.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
      doc = Nokogiri::HTML(html)
      absolutize_links!(doc, url)

      title = doc.at("title")&.text&.strip
      raise "No title found" if title.to_s.empty?

      text_content = text_from_html(doc.to_html)
    rescue => ex
      # Any failure while fetching or converting aborts the whole run;
      # `exit` still triggers the outer ensure, so the browser is closed.
      warn "Archive error (#{ex}; #{url})"
      exit 1
    end

    # Short, filename-safe fingerprint of the URL plus its rendered text:
    # the first six alphanumeric characters of the base64 MD5 digest.
    hash = Digest::MD5.base64digest(url + text_content)
      .scan(/[a-z0-9]/i)
      .first(6)
      .join
      .downcase

    filename = "#{URI.parse(url).host.gsub(".", "-")}-#{hash}.txt"

    File.write("static/archive/#{filename}", text_content)

    # The title is emitted as a double-quoted YAML scalar, so collapse
    # embedded line breaks and escape backslashes and double quotes to keep
    # the entry valid for any page title.
    yaml_title = title.gsub(/\s+/, " ").gsub(/["\\]/) { |char| "\\#{char}" }

    # Nested keys are indented to line up under `title` so the entry forms
    # one mapping inside the `references:` sequence.
    yaml = <<~STR
      - title: "#{yaml_title}"
        url: #{url}
        date: #{Time.now.utc.iso8601}
        file: #{filename}
    STR

    puts yaml

    clipboard << yaml
  end
ensure
  browser.quit
end

IO.popen("pbcopy", "w") { |pb| pb.write(clipboard) }
|