ContentGrabbingJi/src/main/java/net/lamgc/cgj/pixiv/PixivDownload.java
2020-03-26 14:39:59 +08:00

534 lines
24 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package net.lamgc.cgj.pixiv;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import io.netty.handler.codec.http.HttpHeaderNames;
import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.message.BasicHeader;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiConsumer;
public class PixivDownload {
private final static Logger log = LoggerFactory.getLogger("PixivDownload");
private final HttpClient httpClient;
private final CookieStore cookieStore;
/**
* 构造一个PixivDownload对象
* @param cookieStore 存在已登录Pixiv的CookieStore对象
*/
public PixivDownload(CookieStore cookieStore) {
this(cookieStore, null);
}
/**
* 构造一个PixivDownload对象
* @param cookieStore 存在已登录Pixiv的CookieStore对象
* @param proxy 访问代理
*/
public PixivDownload(CookieStore cookieStore, HttpHost proxy) {
this.cookieStore = cookieStore;
HttpClientBuilder builder = HttpClientBuilder.create();
builder.setDefaultCookieStore(cookieStore);
// UA: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36
ArrayList<Header> defaultHeaders = new ArrayList<>(2);
defaultHeaders.add(new BasicHeader("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"));
builder.setDefaultHeaders(defaultHeaders);
builder.setProxy(proxy);
httpClient = builder.build();
}
public HttpClient getHttpClient() {
return httpClient;
}
public CookieStore getCookieStore() {
return cookieStore;
}
/**
* 获取帐号所有的收藏插画,并以输入流形式提供
* @param fn 回调函数函数传进的InputStream无需手动关闭
* @throws IOException 当获取时发生异常则直接抛出
*/
public void getCollectionAsInputStream(PageQuality quality, BiConsumer<String, InputStream> fn) throws IOException {
int pageIndex = 0;
HttpGet request;
Document document;
ArrayList<String> linkList = new ArrayList<>();
do {
request = new HttpGet(PixivURL.PIXIV_USER_COLLECTION_PAGE.replace("{pageIndex}", Integer.toString(++pageIndex)));
setCookieInRequest(request, cookieStore);
log.info("Request Link: " + request.getURI().toString());
HttpResponse response = httpClient.execute(request);
// 解析网页内容,获得所有的收藏信息
document = Jsoup.parse(EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8));
Elements items = document.select(".display_editable_works .image-item a.work");
List<String> hrefList = items.eachAttr("href");
log.info("第 {} 页获取到的图片项数量: {}", pageIndex, hrefList.size());
if(hrefList.size() == 0) {
break;
}
Gson gson = new Gson();
for (String href : hrefList) {
HttpGet linkApiRequest = createHttpGetRequest(PixivURL.PIXIV_ILLUST_API_URL.replace("{illustId}", href.substring(href.lastIndexOf("/") + 1)));
log.debug(linkApiRequest.getURI().toString());
HttpResponse httpResponse = httpClient.execute(linkApiRequest);
JsonObject linkResult = gson.fromJson(EntityUtils.toString(httpResponse.getEntity()), JsonObject.class);
if(linkResult.get("error").getAsBoolean()) {
log.error("接口返回错误信息: {}", linkResult.get("message").getAsString());
continue;
}
JsonArray linkArray = linkResult.get("body").getAsJsonArray();
for (int i = 0; i < linkArray.size(); i++) {
JsonObject linkObject = linkArray.get(i).getAsJsonObject().get("urls").getAsJsonObject();
linkList.add(linkObject.get((quality == null ? PageQuality.ORIGINAL : quality).toString().toLowerCase()).getAsString());
}
}
} while(!document.select(".pager-container>.next").isEmpty());
log.info("获取完成.");
AtomicInteger count = new AtomicInteger(1);
linkList.forEach(link -> {
log.info("Next Link [{}]: {}", count.getAndIncrement(), link);
InputStream imageInputStream = null;
int tryCount = 0;
do {
try {
imageInputStream = getImageAsInputStream(httpClient, link);
} catch (IOException e) {
log.error("获取图片数据时发生异常", e);
if(++tryCount < 5) {
log.warn("即将重试[{} / 5]", tryCount);
}
}
} while(imageInputStream == null);
try(InputStream imageInput = new BufferedInputStream(imageInputStream, 256 * 1024)) {
log.debug("调用回调方法...");
fn.accept(link, imageInput);
log.debug("回调方法调用完成.");
} catch (IOException e) {
log.error("图片获取失败", e);
}
});
}
/**
* 获取推荐插图
* @param quality 图片质量
* @param fn 回调函数
* @throws IOException 当获取时发生异常则直接抛出
*/
public void getRecommendAsInputStream(PageQuality quality, BiConsumer<String, InputStream> fn) throws IOException {
HttpResponse response = httpClient.execute(createHttpGetRequest(PixivURL.PIXIV_INDEX_URL));
Document document = Jsoup.parse(EntityUtils.toString(response.getEntity()));
HttpClient imageClient = HttpClientBuilder.create().build();
Elements elements = document.select(".gtm-illust-recommend-zone>.image-item>.gtm-illust-recommend-thumbnail-link");
for(int illustIndex = 0; illustIndex < elements.size(); illustIndex++){
String href = elements.get(illustIndex).attr("href");
int illustId = Integer.parseInt(href.substring(href.lastIndexOf("/") + 1));
log.info("({}/{}) Href: {}, IllustID: {}", illustIndex + 1, elements.size(), href, illustId);
List<String> pageLinkList = getIllustAllPageDownload(httpClient, this.cookieStore, illustId, quality);
for (int linkIndex = 0; linkIndex < pageLinkList.size(); linkIndex++) {
String link = pageLinkList.get(linkIndex);
String fileName = link.substring(link.lastIndexOf("/") + 1);
log.info("({}/{})正在处理 {}", linkIndex, pageLinkList.size(), fileName);
InputStream imageInputStream = null;
int tryCount = 0;
do {
try {
imageInputStream = getImageAsInputStream(imageClient, link);
} catch (IOException e) {
log.error("获取图片数据时发生异常", e);
if(++tryCount < 5) {
log.warn("即将重试[{} / 5]", tryCount);
}
}
} while(imageInputStream == null);
try(InputStream pageInputStream = new BufferedInputStream(imageInputStream, 256 * 1024)) {
fn.accept(fileName, pageInputStream);
}
log.info("Done!");
}
log.info("IllustId {} 处理完成.", illustId);
}
}
/**
* 获取排行榜
* @param contentType 内容类型
* @param mode 查询模式
* @param time 查询时间
* @param range 从第一名开始的范围
* @param quality 图片质量
* @param fn 回调函数
* @throws IOException 当请求发生异常时抛出
*/
public void getRankingAsInputStream(PixivURL.RankingContentType contentType, PixivURL.RankingMode mode,
Date time, int range, PageQuality quality, RankingDownloadFunction fn) throws IOException {
getRankingAsInputStream(contentType, mode, time, 1, range, quality, fn);
}
/**
* 获取排行榜
* @param contentType 内容类型
* @param mode 查询模式
* @param time 查询时间
* @param rankStart 开始排行位(包括)
* @param range 范围
* @param quality 图片质量
* @param fn 回调函数
* @throws IOException 当请求发生异常时抛出
*/
public void getRankingAsInputStream(PixivURL.RankingContentType contentType, PixivURL.RankingMode mode,
Date time, int rankStart, int range, PageQuality quality, RankingDownloadFunction fn) throws IOException {
getRanking(contentType, mode, time, rankStart, range).forEach(rankInfo -> {
int rank = rankInfo.get("rank").getAsInt();
int illustId = rankInfo.get("illust_id").getAsInt();
int authorId = rankInfo.get("user_id").getAsInt();
String authorName = rankInfo.get("user_name").getAsString();
String title = rankInfo.get("title").getAsString();
log.info("当前到第 {}/{} 名(总共 {} 名), IllustID: {}, Author: ({}) {}, Title: {}", rank, rankStart + range - 1, range, illustId, authorId, authorName, title);
log.info("正在获取PagesLink...");
List<String> linkList;
try {
linkList = getIllustAllPageDownload(httpClient, this.cookieStore, illustId, quality);
} catch (IOException e) {
if(e.getMessage().equals("該当作品は削除されたか、存在しない作品IDです。")) {
log.warn("作品 {} 不存在.", illustId);
} else {
e.printStackTrace();
}
return;
}
log.info("PagesLink 获取完成, 总数: {}", linkList.size());
for (int pageIndex = 0; pageIndex < linkList.size(); pageIndex++) {
String downloadLink = linkList.get(pageIndex);
log.info("当前Page: {}/{}", pageIndex + 1, linkList.size());
try(InputStream imageInputStream = new BufferedInputStream(getImageAsInputStream(HttpClientBuilder.create().build(), downloadLink), 256 * 1024)) {
fn.download(rank, downloadLink, rankInfo.deepCopy(), imageInputStream);
} catch(IOException e) {
log.error("下载插画时发生异常", e);
return;
}
log.info("完成.");
}
});
}
public List<JsonObject> getRanking(PixivURL.RankingContentType contentType, PixivURL.RankingMode mode,
Date time, int rankStart, int range) throws IOException {
if(rankStart <= 0) {
throw new IllegalArgumentException("rankStart cannot be less than or equal to zero");
}
if(range <= 0) {
throw new IllegalArgumentException("range cannot be less than or equal to zero");
}
int startPage = (int) Math.ceil(rankStart / 50F);
int requestFrequency = (int) Math.ceil((rankStart + (range - 1)) / 50F);
int surplusQuantity = range;
boolean firstRequest = true;
Gson gson = new Gson();
ArrayList<JsonObject> results = new ArrayList<>();
for (int requestCount = startPage; requestCount <= requestFrequency && requestCount <= 10; requestCount++) {
int rangeStart = (requestCount - 1) * 50 + 1;
log.info("正在请求第 {} 到 {} 位排名榜 (第{}次请求, 共 {} 次)", rangeStart, rangeStart + 49, requestCount - startPage + 1, requestFrequency - startPage);
HttpGet request = createHttpGetRequest(PixivURL.getRankingLink(contentType, mode, time, requestCount, true));
log.info("Request URL: {}", request.getURI());
HttpResponse response = httpClient.execute(request);
String content = EntityUtils.toString(response.getEntity());
log.debug("Content: " + content);
JsonObject contentObject = gson.fromJson(content, JsonObject.class);
if(contentObject.has("error")) {
log.warn("接口报错, 返回信息: {}", contentObject.get("error").getAsString());
break;
}
JsonArray rankingArray = contentObject.getAsJsonArray("contents");
log.info("正在解析数据...");
//需要添加一个总量, 否则会完整跑完一次.
//检查是否为最后一次请求,和剩余量有多少
int firstRequestStartIndex = (rankStart % 50) - 1;
for (int rankIndex = firstRequest ? firstRequestStartIndex : 0; rankIndex < rankingArray.size() && surplusQuantity > 0; rankIndex++, surplusQuantity--) {
JsonElement jsonElement = rankingArray.get(rankIndex);
JsonObject rankInfo = jsonElement.getAsJsonObject();
int rank = rankInfo.get("rank").getAsInt();
int illustId = rankInfo.get("illust_id").getAsInt();
int authorId = rankInfo.get("user_id").getAsInt();
String authorName = rankInfo.get("user_name").getAsString();
String title = rankInfo.get("title").getAsString();
log.info("当前到第 {}/{} 名(总共 {} 名), IllustID: {}, Author: ({}) {}, Title: {}", rank, rankStart + range - 1, range, illustId, authorId, authorName, title);
results.add(rankInfo);
}
firstRequest = false;
log.info("第 {} 到 {} 位排名榜完成. (第{}次请求)", rangeStart, rangeStart + 49, requestCount);
}
if(requestFrequency > 10) {
log.warn("请求的排名榜范围超出所支持的范围, 已终止请求.");
}
return results;
}
/**
* 获取作品的预加载数据
* @param illustId 作品id
* @return 如果请求成功返回JsonObject, 失败返回null
* @throws IOException 当请求响应非200或请求发生异常时抛出.
*/
public JsonObject getIllustPreLoadDataById(int illustId) throws IOException {
HttpGet request = createHttpGetRequest(PixivURL.getPixivRefererLink(illustId));
HttpResponse response = httpClient.execute(request);
if(response.getStatusLine().getStatusCode() != 200) {
throw new IOException("Http响应码非200: " + response.getStatusLine());
}
Document document = Jsoup.parse(EntityUtils.toString(response.getEntity()));
Elements selectElements = document.select("#meta-preload-data");
if(selectElements.size() == 0) {
return null;
}
return new Gson().fromJson(selectElements.attr("content"), JsonObject.class);
}
@FunctionalInterface
public interface RankingDownloadFunction {
/**
* 接收图片InputStream
* @param rank 当前作品排名
* @param link 作品下载链接
* @param inputStream 作品下载输入流, InputStream会自动关闭
*/
void download(int rank, String link, JsonObject rankInfo, InputStream inputStream);
}
public HttpGet createHttpGetRequest(String url) {
HttpGet request = new HttpGet(url);
setCookieInRequest(request, cookieStore);
return request;
}
/**
* 取Illust所有页的原图下载链接
* @param httpClient 用于发起请求的HttpClient对象
* @param illustId 插画ID
* @param quality 页质量, 见{@link PageQuality}
* @return 返回该illust所有Page的下载链接
* @throws IOException 当HttpClient在请求时发生异常, 或接口报错时抛出, 注意{@link IOException#getMessage()}
*/
public static List<String> getIllustAllPageDownload(HttpClient httpClient, CookieStore cookieStore, int illustId, PageQuality quality) throws IOException {
HttpGet linkApiRequest = new HttpGet(PixivURL.PIXIV_ILLUST_API_URL.replace("{illustId}", Integer.toString(illustId)));
setCookieInRequest(linkApiRequest, cookieStore);
HttpResponse response = httpClient.execute(linkApiRequest);
JsonObject resultObject = new Gson().fromJson(EntityUtils.toString(response.getEntity()), JsonObject.class);
if(resultObject.get("error").getAsBoolean()) {
String message = resultObject.get("message").getAsString();
log.info("请求错误, 错误信息: {}", message);
throw new IOException(message);
}
JsonArray linkArray = resultObject.getAsJsonArray("body");
ArrayList<String> resultList = new ArrayList<>();
String qualityType = quality == null ? "original" : quality.toString().toLowerCase();
log.info("已选择插画类型: {}", qualityType);
linkArray.forEach(el -> {
JsonObject urlObj = el.getAsJsonObject().getAsJsonObject("urls");
resultList.add(urlObj.get(qualityType).getAsString());
});
return resultList;
}
/**
* 插图质量
*/
public enum PageQuality{
/**
* 原图画质
*/
ORIGINAL,
/**
* 常规画质
*/
REGULAR,
/**
* 小图画质
*/
SMALL,
/**
* 迷你画质
*/
THUMB_MINI
}
/**
* 获取帐号所有的收藏插画,并以输入流形式提供
* @return 获取所有链接的InputStream, 请注意关闭InputStream
* @throws IOException 当获取时发生异常则直接抛出
*/
public Set<Map.Entry<String, InputStream>> getCollectionAsInputStream(PageQuality quality) throws IOException {
HashSet<Map.Entry<String, InputStream>> illustInputStreamSet = new HashSet<>();
getCollectionAsInputStream(quality, (link, inputStream) -> illustInputStreamSet.add(new AbstractMap.SimpleEntry<>(link, inputStream)));
return illustInputStreamSet;
}
/**
* 获取Pixiv图片
* @param httpClient HttpClient对象
* @param link Pixiv图片链接
* @return 返回图片InputStream注意关闭InputStream
* @throws IOException 获取失败时抛出
* @throws IllegalArgumentException 当链接无法处理时抛出
*/
public static InputStream getImageAsInputStream(HttpClient httpClient, String link) throws IOException {
HttpGet request = new HttpGet(link);
int startIndex = link.lastIndexOf("/");
int endIndex = link.lastIndexOf("_");
if(startIndex == -1 || endIndex == -1) {
throw new IllegalArgumentException("无法从链接获取illustID: " + link);
}
String referer = PixivURL.getPixivRefererLink(link.substring(startIndex + 1, endIndex));
request.addHeader(HttpHeaderNames.REFERER.toString(), referer);
HttpResponse response = httpClient.execute(request);
log.debug("response: {}", response);
log.info("Content Length: {}KB", Float.parseFloat(response.getFirstHeader(HttpHeaderNames.CONTENT_LENGTH.toString()).getValue()) / 1024F);
log.info("{}", response.getFirstHeader(HttpHeaderNames.CONTENT_TYPE.toString()));
return response.getEntity().getContent();
}
/**
* 登出当前会话.<br/>
* 登出成功后, 该Cookies作废.
* @return 返回是否成功登出
* @throws IOException 当登出请求异常时抛出
*/
public boolean logOut() throws IOException {
HttpGet request = new HttpGet(PixivURL.PIXIV_LOGOUT_URL);
request.setConfig(RequestConfig.custom().setRedirectsEnabled(false).build());
setCookieInRequest(request, cookieStore);
HttpResponse response = httpClient.execute(request);
if (response.getStatusLine().getStatusCode() == 302) {
cookieStore.clear();
return true;
} else {
return false;
}
}
/**
* 获取作品信息
* @param illustId 作品ID
* @return 成功获取返回JsonObject, 失败返回null, <br/>
* Json示例: <br/>
* <pre>
* {
* "illustId": "79584670",
* "illustTitle": "このヤンキーはウブすぎる",
* "id": "79584670",
* "title": "このヤンキーはウブすぎる",
* "illustType": 1,
* "xRestrict": 0,
* "restrict": 0,
* "sl": 2,
* "url": "https://i.pximg.net/c/360x360_70/img-master/img/2020/02/19/00/38/23/79584670_p0_square1200.jpg",
* "description": "",
* "tags": [
* "漫画",
* "オリジナル",
* "創作",
* "創作男女",
* "コロさん、ポリさん此方です!",
* "恋の予感",
* "あまずっぺー",
* "交換日記",
* "続編希望!!",
* "オリジナル10000users入り"
* ],
* "userId": "4778293",
* "userName": "隈浪さえ",
* "width": 3288,
* "height": 4564,
* "pageCount": 4,
* "isBookmarkable": true,
* "bookmarkData": null,
* "alt": "#オリジナル このヤンキーはウブすぎる - 隈浪さえ的漫画",
* "isAdContainer": false,
* "profileImageUrl": "https://i.pximg.net/user-profile/img/2019/12/04/18/56/19/16639046_fea29ce38ea89b0cb2313b40b3a72f9a_50.jpg",
* "type": "illust"
* }
* </pre>
* @throws IOException 当请求发生异常, 或接口返回错误信息时抛出.
*/
public JsonObject getIllustInfoByIllustId(int illustId) throws IOException {
HttpGet request = createHttpGetRequest(PixivURL.getPixivIllustInfoAPI(new int[] {illustId}));;
HttpResponse response = httpClient.execute(request);
String responseStr = EntityUtils.toString(response.getEntity());
log.debug("Response Content: {}", responseStr);
JsonObject responseObj = new Gson().fromJson(responseStr, JsonObject.class);
if(responseObj.get("error").getAsBoolean()) {
throw new IOException(responseObj.get("message").getAsString());
}
JsonArray illustsArray = responseObj.getAsJsonObject("body").getAsJsonArray("illusts");
if(illustsArray.size() == 1) {
return illustsArray.get(0).getAsJsonObject();
} else {
return null;
}
}
public static void setCookieInRequest(HttpRequest request, CookieStore cookieStore) {
StringBuilder builder = new StringBuilder();
cookieStore.getCookies().forEach(cookie -> builder.append(cookie.getName()).append("=").append(cookie.getValue()).append("; "));
request.setHeader(HttpHeaderNames.COOKIE.toString(), builder.toString());
}
}