Fetching kanxue blogs with an HTTParty POST request
使用post httparty
cat scripts/get_blogs_from_kanxue_use_httparty.rb
# Choose the Rails environment: the first command-line argument wins, then an
# already exported RAILS_ENV, then 'development'. It is assigned before
# config/environment is required below.
ENV['RAILS_ENV'] = ARGV.first || ENV['RAILS_ENV'] || 'development'
# Load config/environment.rb from the directory one level above this script.
require File.expand_path(File.dirname(__FILE__) + "/../config/environment")
require 'rails'
require 'json'
require 'rubygems'
require 'httparty'
require 'nokogiri'
# Replace Rails.logger with a logger that writes to this script's own log file.
# The path is relative, so it resolves against the current working directory.
Rails.logger = Logger.new("log/update_kanxue_blogs_using_httparty.log")
# Crawls the kanxue.com home-page article feed one page at a time and mirrors
# every article (and its comma-separated tags) into the local Blog and
# Category tables. Pages KANXUE_FIRST_PAGE..KANXUE_LAST_PAGE are requested in
# order, with a pause between pages.

KANXUE_FEED_URL = "https://www.kanxue.com/homepost-morearticle.htm"
KANXUE_FIRST_PAGE = 19
KANXUE_LAST_PAGE = 1650

# Session cookie captured from a browser. It can be overridden with the
# KANXUE_COOKIE environment variable; the captured value is the fallback.
# NOTE(review): this is a session identifier kept in source and it will
# presumably expire - confirm whether the endpoint needs it at all.
KANXUE_DEFAULT_COOKIE = "__jsluid_s=61f26b9d4fbbe7b03612225e7a61b899; Hm_lvt_820e73ad7ccba42be0e5b528c537e327=1666926727,1667283362; __jsluid_h=dfdce607d78e6fd261a8a092f8d3eb39; tree_expandedNodes=%2C1001600%2C1001671%2C1000000%2C1000001%2C1000002%2C1001202%2C1000713%2C1001372%2C1001373%2C1000003%2C1000965%2C1001189%2C1000966%2C1000972%2C1001522%2C; PHPSESSID=278ff8bf0af353e637c64377aad1716c"

# Builds the browser-like request headers.
#
# A new Hash is returned on every call (it is deliberately not a frozen
# constant) so the HTTP client is free to modify the hash it receives.
#
# @return [Hash{String => String}]
def kanxue_request_headers
  {
    "Host" => "www.kanxue.com",
    "User-Agent" => "Mozilla/5.0 (X11; Linux x86_64; rb:108.0) Gecko/20100101 Firefox/108.0",
    "Accept" => "text/plain, */*; q=0.01",
    "Accept-Language" => "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    # "br" is not advertised: HTTParty can only decode Brotli when the
    # optional brotli gem is installed, and an undecoded body breaks JSON.parse.
    "Accept-Encoding" => "gzip, deflate",
    "Referer" => "https://www.kanxue.com/",
    "Content-Type" => "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With" => "XMLHttpRequest",
    # No Content-Length here: the HTTP client derives it from the body it
    # actually sends, and a hard-coded value would not match.
    "Origin" => "https://www.kanxue.com",
    "Connection" => "keep-alive",
    "Cookie" => ENV.fetch("KANXUE_COOKIE", KANXUE_DEFAULT_COOKIE),
    "Sec-Fetch-Dest" => "empty",
    "Sec-Fetch-Mode" => "cors",
    "Sec-Fetch-Site" => "same-origin",
    # String key like every other header (the `"DNT": "1"` form makes a Symbol key).
    "DNT" => "1"
  }
end

# Requests one page of the feed and returns the article list.
#
# Network failures, unparsable JSON and unexpected response shapes are logged
# and produce an empty list, so one bad page does not abort the whole crawl.
#
# @param page [Integer] page number to request
# @return [Array<Hash>] the entries under message.list, or [] when unavailable
def fetch_kanxue_blog_list(page)
  query = {
    "page" => page.to_s,
    "pagesize" => "10"
  }
  Rails.logger.info "==== step 1 httparty post, query: #{query}, url: #{KANXUE_FEED_URL}"
  response = HTTParty.post(KANXUE_FEED_URL, :query => query, :headers => kanxue_request_headers)

  # Parse the raw body string; the response object itself is not a String.
  result = JSON.parse(response.body.to_s)
  Rails.logger.info "=== step 2 after post, result: #{result} response.code, #{response.code} ===response.headers is #{response.headers} query: #{query}"

  message = result.is_a?(Hash) ? result['message'] : nil
  list = message.is_a?(Hash) ? message['list'] : nil
  list.is_a?(Array) ? list : []
rescue JSON::ParserError, HTTParty::Error, Timeout::Error, SocketError, SystemCallError => e
  Rails.logger.error "=== fetch failed for page #{page}: #{e.class}: #{e.message}"
  []
end

# Stores one feed entry as a Blog row (if it is not stored yet) and makes sure
# a Category row exists for each of its tags.
#
# @param blog [Hash] one entry of the feed's message.list array
# @return [void]
def import_kanxue_blog(blog)
  Rails.logger.info "=== step 3 before if blog_url #{blog['source_url'].inspect}"
  if blog['source_url'].present?
    blog_url = blog['source_url']
    Rails.logger.info "==== step 3.1 in present blog_url: #{blog_url} title: #{blog['subject']} created_at: #{blog['create_date_fmt']}"
  else
    # Entries without a source_url get a URL built from the article id.
    blog_url = "https://zhuanlan.kanxue.com/article-#{blog['articleid']}.htm"
    Rails.logger.info "==== step 3.2 in else blog_url: #{blog_url} title: #{blog['subject']} created_at: #{blog['create_date_fmt']}"
  end

  blog_local = Blog.where('blog_url = ? and source_website = ?', blog_url, 'kanxue').first
  if blog_local.nil?
    userinfo = blog['userinfo']
    blog_local = Blog.create(
      blog_url: blog_url,
      title: blog['subject'],
      created_at: blog['create_date_fmt'],
      author: (userinfo.is_a?(Hash) ? userinfo['username'] : nil),
      image_url: "https://www.kanxue.com/#{blog['pic_fmt']}",
      brief: blog['brief'],
      views: blog['views'].to_i,
      source_website: 'kanxue'
    )
    # Pause after each newly stored article (presumably throttling - confirm).
    sleep 10
  end

  # A failed create returns an unsaved record with no id; do not attach
  # categories to it.
  unless blog_local.persisted?
    Rails.logger.error "=== blog not saved, skip categories. blog_url: #{blog_url} errors: #{blog_local.errors.full_messages.inspect}"
    return
  end

  # `tags` may be missing; to_s makes that an empty list. Empty names are skipped.
  blog['tags'].to_s.split(',').reject(&:empty?).each do |category_name|
    category_local = Category.where('name = ? and blog_id = ?', category_name, blog_local.id).first
    Rails.logger.info "==== step 4 in create category, category_local is #{category_local.inspect} category_name: #{category_name} blog_local.id #{blog_local.id}"
    Category.create name: category_name, blog_id: blog_local.id if category_local.blank?
  end
end

(KANXUE_FIRST_PAGE..KANXUE_LAST_PAGE).each do |page|
  fetch_kanxue_blog_list(page).each { |blog| import_kanxue_blog(blog) }
  # Pause between page requests.
  sleep 50
end