#!/usr/bin/env ruby
# frozen_string_literal: true

# elsewhere -- import articles published on another site as Markdown pages.
#
# Usage: elsewhere URL [URL...]
#
# For every URL given on the command line the script:
#   1. saves the page HTML to tmp/<name>.html (reused if already present),
#   2. converts the page's first `div.page-blocks` element to Markdown,
#   3. downloads each image the Markdown references into
#      content/elsewhere/<name>/ and points the image link at the local copy,
#   4. writes content/elsewhere/<name>/index.md with YAML front matter.
#
# <name> is the last path segment of the URL. The nokogiri and pandoc-ruby
# gems are declared in bin/elsewhere/Gemfile; `curl` must be on the PATH.
#
# Exit status is 1 when no URL is supplied or when any URL could not be
# imported.

require "date"
require "fileutils" # provides FileUtils.mkdir_p / rm_f; not loaded by default
require "net/http"
require "nokogiri"
require "pandoc-ruby"
# NOTE(review): pry is not declared in bin/elsewhere/Gemfile and nothing in
# this script calls it -- confirm it is still wanted.
require "pry"
require "uri"

BASEDIR = "content/elsewhere"

# A Markdown image followed by an attribute block, e.g.
# `![image](https://host/a.png){width="10"}`. The /m flag lets the attribute
# block span several lines.
IMAGE_WITH_ATTRIBUTES = /^!\[image\]\((.*?)\)\{.*?\}/m

# Saves the resource at +uri+ to +filename+ unless that file already exists.
#
# curl is started without a shell and Ruby performs the output redirection,
# so quotes, spaces or `$(...)` in the URL or file name are passed through
# literally instead of being interpreted as shell syntax.
#
# @param uri [URI, String] location to fetch
# @param filename [String] path the response body is written to
# @return [Boolean] true when +filename+ is available afterwards
def download(uri, filename)
  if File.exist?(filename)
    puts "#{filename} already exists"
    return true
  end

  puts "Downloading #{uri}..."
  return true if system("curl", "-s", uri.to_s, out: filename)

  # Remove the empty/partial file so a later run retries the download instead
  # of reporting "already exists".
  warn "Failed to download #{uri}"
  FileUtils.rm_f(filename)
  false
end

# Removes converter artifacts from +markdown+: whole lines that start with
# ":::" and a backslash left at the end of a line.
#
# @param markdown [String]
# @return [String] a cleaned copy; the argument is not modified
def clean_markdown(markdown)
  without_div_markers = markdown.gsub(/^:{3}.*\n/, "")
  without_div_markers.gsub(/\\$/, "")
end

# Downloads every image matched by IMAGE_WITH_ATTRIBUTES into +dir+ and
# rewrites the link to the bare local file name, dropping the attribute block.
#
# @param markdown [String]
# @param page_uri [URI] URL of the page; relative image URLs are resolved
#   against it, absolute ones are used unchanged
# @param dir [String] directory the images are saved to
# @return [String] Markdown with the image links rewritten
def localize_images(markdown, page_uri, dir)
  markdown.gsub(IMAGE_WITH_ATTRIBUTES) do
    img_uri = page_uri.merge(Regexp.last_match(1))
    img_filename = File.basename(img_uri.path)

    download(img_uri, File.join(dir, img_filename))

    "![image](#{img_filename})"
  end
end

# Builds the YAML front matter block, including the blank line that separates
# it from the page body.
#
# @param title [String] raw page title
# @param date [String] publication timestamp
# @param canonical_url [URI, String] URL the article was imported from
# @return [String]
def front_matter(title:, date:, canonical_url:)
  # Backslash and double quote both need escaping inside a double-quoted
  # YAML scalar.
  escaped_title = title.gsub(/["\\]/) { |char| "\\#{char}" }

  <<~FRONT_MATTER
    ---
    title: "#{escaped_title}"
    date: #{date}
    draft: false
    needs_review: true
    canonical_url: #{canonical_url}
    ---

  FRONT_MATTER
end

# Imports the article at +url+ into BASEDIR/<name>/index.md.
#
# @param url [String]
# @return [Boolean] false when the page could not be downloaded or lacks the
#   expected `div.page-blocks` / `<time datetime>` elements
def import_article(url)
  uri = URI.parse(url)
  name = File.basename(uri.path)
  html_path = "tmp/#{name}.html"
  return false unless download(uri, html_path)

  # Block form closes the file handle as soon as the document is parsed.
  article = File.open(html_path) { |file| Nokogiri::HTML(file) }
  content = article.at_css("div.page-blocks")
  published = article.at_css("time")&.[]("datetime")

  unless content && published
    warn "Skipping #{url}: no div.page-blocks content or <time datetime> found"
    return false
  end

  page_dir = "#{BASEDIR}/#{name}"
  FileUtils.mkdir_p page_dir

  markdown = PandocRuby.convert(content.to_html, from: :html, to: :markdown)
  markdown = localize_images(clean_markdown(markdown), uri, page_dir)

  title = article.css("title").text.gsub(" | Viget", "")
  # Date.parse keeps only the calendar date, so the timestamp written is
  # midnight at a +00:00 offset.
  date = Date.parse(published).to_datetime.to_s
  header = front_matter(title: title, date: date, canonical_url: uri)

  File.write("#{page_dir}/index.md", header + markdown)
  true
end

urls = ARGV

if urls.empty?
  warn "Please supply one or more URLs"
  exit 1
end

FileUtils.mkdir_p "tmp"

# Attempt every URL even when an earlier one fails, then report any failure
# through the exit status.
imported = urls.map { |url| import_article(url) }
exit 1 unless imported.all?