deviantart/types/
scraped_webpage_info.rs

1use super::Deviation;
2use super::Media;
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashMap;
6use url::Url;
7
8/// An error that may occur while parsing a [`ScrapedWebPageInfo`] from a html string.
9#[derive(Debug, thiserror::Error)]
10pub enum FromHtmlStrError {
11    /// Missing the InitialState variable
12    #[error("missing initial state")]
13    MissingInitialState,
14
15    /// Failed to parse some state
16    #[error(transparent)]
17    InvalidJson(#[from] serde_json::Error),
18}
19
20/// Info scraped from a deviation url
21#[derive(Debug, serde::Deserialize)]
22pub struct ScrapedWebPageInfo {
23    /// Page config like csrf tokens
24    #[serde(rename = "@@config")]
25    pub config: Config,
26
27    /// Deviations extended deviations maybe?
28    #[serde(rename = "@@entities")]
29    pub entities: Option<Entities>,
30
31    /// ?
32    #[serde(rename = "@@DUPERBROWSE")]
33    pub duper_browse: Option<DuperBrowse>,
34
35    /// Info about the current session
36    #[serde(rename = "@@publicSession")]
37    pub public_session: PublicSession,
38
39    /// Streams
40    #[serde(rename = "@@streams")]
41    pub streams: Option<Streams>,
42
43    /// Needed for login.
44    ///
45    /// Note that this is a different csrf token from the config struct.
46    #[serde(rename = "csrfToken")]
47    pub csrf_token: Option<Box<str>>,
48
49    #[serde(rename = "gallectionSection")]
50    pub gallection_section: Option<GallectionSection>,
51
52    /// Needed for login.
53    #[serde(rename = "luToken")]
54    pub lu_token: Option<Box<str>>,
55
56    /// Needed for login.
57    #[serde(rename = "luToken2")]
58    pub lu_token2: Option<Box<str>>,
59
60    /// Unknown data
61    #[serde(flatten)]
62    pub unknown: HashMap<String, serde_json::Value>,
63}
64
65impl ScrapedWebPageInfo {
66    /// Parse this from a html string
67    pub fn from_html_str(input: &str) -> Result<Self, FromHtmlStrError> {
68        static REGEX: Lazy<Regex> = Lazy::new(|| {
69            Regex::new(r#"window\.__INITIAL_STATE__ = JSON\.parse\("(.*)"\);"#).unwrap()
70        });
71
72        let capture = REGEX
73            .captures(input)
74            .and_then(|captures| captures.get(1))
75            .ok_or(FromHtmlStrError::MissingInitialState)?;
76        // TODO: Escape properly
77        let capture = capture
78            .as_str()
79            .replace("\\\"", "\"")
80            .replace("\\'", "'")
81            .replace("\\\\", "\\");
82        Ok(serde_json::from_str(&capture)?)
83    }
84
85    /// Returns `true` if logged in
86    pub fn is_logged_in(&self) -> bool {
87        self.public_session.is_logged_in
88    }
89
90    /// Get the current deviation's id
91    pub fn get_current_deviation_id(&self) -> Option<&serde_json::Value> {
92        Some(
93            &self
94                .duper_browse
95                .as_ref()?
96                .root_stream
97                .as_ref()?
98                .current_open_item,
99        )
100    }
101
102    /// Get the [`Deviation`] for this page.
103    pub fn get_current_deviation(&self) -> Option<&Deviation> {
104        let id = self.get_current_deviation_id()?;
105        let id = match id {
106            serde_json::Value::Number(n) => n.as_u64()?,
107            serde_json::Value::String(s) => s.parse().ok()?,
108            _ => return None,
109        };
110        self.get_deviation_by_id(id)
111    }
112
113    /// Get the [`DeviationExtended`] for this page.
114    pub fn get_current_deviation_extended(&self) -> Option<&DeviationExtended> {
115        let id = self.get_current_deviation_id()?;
116        let mut key_buffer = itoa::Buffer::new();
117        let key = match id {
118            serde_json::Value::Number(n) => {
119                let n = n.as_u64()?;
120                key_buffer.format(n)
121            }
122            serde_json::Value::String(s) => s,
123            _ => return None,
124        };
125        self.entities
126            .as_ref()?
127            .deviation_extended
128            .as_ref()?
129            .get(key)
130    }
131
132    /// Get a deviation by id, if it exists
133    pub fn get_deviation_by_id(&self, id: u64) -> Option<&Deviation> {
134        let mut key_buffer = itoa::Buffer::new();
135        self.entities.as_ref()?.deviation.get(key_buffer.format(id))
136    }
137
138    /// Take a deviation by id, if it exists
139    pub fn take_deviation_by_id(&mut self, id: u64) -> Option<Deviation> {
140        let mut key_buffer = itoa::Buffer::new();
141        self.entities
142            .as_mut()?
143            .deviation
144            .remove(key_buffer.format(id))
145    }
146
147    /// Get the current folder id, if in a gallery.
148    pub fn get_current_folder_id(&self) -> Option<i64> {
149        Some(self.gallection_section.as_ref()?.selected_folder_id)
150    }
151
152    /// Get a stream for folder post ids, by folder id.
153    ///
154    /// This will return the deviation ids for the current folder.
155    pub fn get_folder_deviations_stream(&self, folder_id: i64) -> Option<&WithOffsetStream> {
156        let key = format!("folder-deviations-gallery-{folder_id}");
157
158        self.streams
159            .as_ref()?
160            .streams
161            .get(&key)?
162            .as_with_offset_stream()
163    }
164
165    /// Get a gallery folder entity by id
166    pub fn get_gallery_folder_entity(&self, folder_id: i64) -> Option<&GalleryFolder> {
167        self.entities
168            .as_ref()?
169            .gallery_folder
170            .as_ref()?
171            .get(itoa::Buffer::new().format(folder_id))
172    }
173
174    /// Get a user entity by id
175    pub fn get_user_entity(&self, user_id: u64) -> Option<&User> {
176        self.entities
177            .as_ref()?
178            .user
179            .as_ref()?
180            .get(itoa::Buffer::new().format(user_id))
181    }
182}
183
184/// ?
185#[derive(Debug, serde::Deserialize)]
186pub struct Config {
187    /// The page's csrf token
188    #[serde(rename = "csrfToken")]
189    pub csrf_token: String,
190
191    /// Unknown data
192    #[serde(flatten)]
193    pub unknown: HashMap<String, serde_json::Value>,
194}
195
196/// Page entities, like deviations, folders, and users.
197#[derive(Debug, serde::Deserialize)]
198pub struct Entities {
199    /// Deviations
200    pub deviation: HashMap<String, Deviation>,
201
202    /// Extended Deviation Info
203    #[serde(rename = "deviationExtended")]
204    pub deviation_extended: Option<HashMap<String, DeviationExtended>>,
205
206    /// Gallery folders
207    #[serde(rename = "galleryFolder")]
208    pub gallery_folder: Option<HashMap<String, GalleryFolder>>,
209
210    /// Users
211    pub user: Option<HashMap<String, User>>,
212
213    /// Unknown data
214    #[serde(flatten)]
215    pub unknown: HashMap<String, serde_json::Value>,
216}
217
218/// Extended Info about a deviation
219#[derive(Debug, serde::Deserialize)]
220pub struct DeviationExtended {
221    /// Download info
222    pub download: Option<Download>,
223
224    /// HTML description
225    pub description: Option<String>,
226
227    /// Other media for this deviation
228    #[serde(rename = "additionalMedia")]
229    pub additional_media: Option<Vec<AdditionalMedia>>,
230
231    /// The id of the deviation this belongs to.
232    #[serde(rename = "parentDeviationEntityId")]
233    pub parent_deviation_entity_id: u64,
234
235    /// Unknown data
236    #[serde(flatten)]
237    pub unknown: HashMap<String, serde_json::Value>,
238}
239
240impl DeviationExtended {
241    /// Check if additional media is downloadable.
242    ///
243    /// Older additionalMedia deviation images can be downloaded.
244    /// Newer ones cannot.
245    ///
246    /// See: https://github.com/mikf/gallery-dl/issues/6653#issuecomment-2816585744
247    pub fn can_download_additional_media(&self) -> bool {
248        // Return false if there is no additional media.
249        let additional_media = match self.additional_media.as_ref() {
250            Some(additional_media) => additional_media,
251            None => return false,
252        };
253
254        // Old deviations can always be downloaded.
255        // ID determined experimentally.
256        if self.parent_deviation_entity_id < 1184619292 {
257            return true;
258        }
259
260        // The second token is for downloads.
261        // If present, we can download.
262        additional_media
263            .iter()
264            .all(|entry| entry.media.token.len() > 1)
265    }
266}
267
268/// A gallery folder
269#[derive(Debug, serde::Deserialize)]
270pub struct GalleryFolder {
271    /// The folder id.
272    ///
273    /// For some reason, this can be -1 sometimes.
274    #[serde(rename = "folderId")]
275    pub folder_id: i64,
276
277    /// The name of the folder
278    pub name: String,
279
280    /// The user id of the owner of the folder
281    pub owner: u64,
282
283    /// Unknown data
284    #[serde(flatten)]
285    pub unknown: HashMap<String, serde_json::Value>,
286}
287
288/// A user
289#[derive(Debug, serde::Deserialize)]
290pub struct User {
291    /// The user id
292    #[serde(rename = "userId")]
293    pub user_id: u64,
294
295    /// The user name
296    pub username: String,
297
298    /// Unknown data
299    #[serde(flatten)]
300    pub unknown: HashMap<String, serde_json::Value>,
301}
302
303#[derive(Debug, serde::Deserialize)]
304pub struct Download {
305    /// The file size
306    pub filesize: u64,
307
308    /// The image height
309    pub height: u32,
310
311    /// The image width
312    pub width: u32,
313
314    /// ?
315    #[serde(rename = "type")]
316    pub kind: String,
317
318    /// The url
319    pub url: Url,
320
321    /// Unknown data
322    #[serde(flatten)]
323    pub unknown: HashMap<String, serde_json::Value>,
324}
325
326#[derive(Debug, serde::Deserialize)]
327pub struct AdditionalMedia {
328    /// Media info
329    pub media: Media,
330
331    /// Unknown data
332    #[serde(flatten)]
333    pub unknown: HashMap<String, serde_json::Value>,
334}
335
336/// ?
337#[derive(Debug, serde::Deserialize)]
338pub struct DuperBrowse {
339    /// ?
340    #[serde(rename = "rootStream")]
341    pub root_stream: Option<RootStream>,
342
343    /// Unknown data
344    #[serde(flatten)]
345    pub unknown: HashMap<String, serde_json::Value>,
346}
347
348/// ?
349#[derive(Debug, serde::Deserialize)]
350pub struct RootStream {
351    /// The id of the current deviation. This is either a number or string.
352    #[serde(rename = "currentOpenItem")]
353    pub current_open_item: serde_json::Value,
354
355    /// Unknown data
356    #[serde(flatten)]
357    pub unknown: HashMap<String, serde_json::Value>,
358}
359
360/// ?
361#[derive(Debug, serde::Deserialize)]
362pub struct PublicSession {
363    /// Whether the user is logged in
364    #[serde(rename = "isLoggedIn")]
365    pub is_logged_in: bool,
366
367    /// Unknown data
368    #[serde(flatten)]
369    pub unknown: HashMap<String, serde_json::Value>,
370}
371
372/// The streams field
373#[derive(Debug, serde::Deserialize)]
374pub struct Streams {
375    /// Search results appear here
376    #[serde(rename = "@@BROWSE_PAGE_STREAM")]
377    pub browse_page_stream: Option<BrowsePageStream>,
378
379    /// Extra data.
380    ///
381    /// This can include data whos purpose is known, like entries in a folder.
382    #[serde(flatten)]
383    pub streams: HashMap<String, Stream>,
384}
385
386/// ?
387#[derive(Debug, serde::Deserialize)]
388#[serde(tag = "streamType")]
389pub enum Stream {
390    #[serde(rename = "WITH_OFFSET")]
391    WithOffset(WithOffsetStream),
392
393    #[serde(untagged)]
394    Unknown(serde_json::Value),
395}
396
397impl Stream {
398    /// Get this as a WithOffset stream.
399    pub fn as_with_offset_stream(&self) -> Option<&WithOffsetStream> {
400        match self {
401            Self::WithOffset(stream) => Some(stream),
402            _ => None,
403        }
404    }
405}
406
407/// ?
408#[derive(Debug, serde::Deserialize)]
409pub struct WithOffsetStream {
410    /// Items in the stream?
411    pub items: Vec<u64>,
412
413    /// The # of items per fetch?
414    #[serde(rename = "itemsPerFetch")]
415    pub items_per_fetch: u32,
416
417    /// Has more entries?
418    #[serde(rename = "hasMore")]
419    pub has_more: bool,
420
421    /// ?
422    #[serde(rename = "hasLess")]
423    pub has_less: bool,
424
425    /// Unknown data
426    #[serde(flatten)]
427    pub unknown: HashMap<String, serde_json::Value>,
428}
429
430/// Search results appear here
431#[derive(Debug, serde::Deserialize)]
432pub struct BrowsePageStream {
433    /// The cursor
434    pub cursor: String,
435
436    /// Whether this has less?
437    #[serde(rename = "hasLess")]
438    pub has_less: bool,
439
440    /// Whether this has more?
441    #[serde(rename = "hasMore")]
442    pub has_more: bool,
443
444    /// Deviation ids?
445    ///
446    /// Usually, these are integers representing deviation ids.
447    /// In some cases, these are strings of the format "xx-nnnnn",
448    /// where the "xx" part is unknown and the "nnnnn" part is a deviation id.
449    pub items: Vec<serde_json::Value>,
450
451    /// The # of items per page
452    #[serde(rename = "itemsPerFetch")]
453    pub items_per_fetch: u64,
454
455    /// Stream Params
456    #[serde(rename = "streamParams")]
457    pub stream_params: StreamParams,
458
459    /// The stream type
460    #[serde(rename = "streamType")]
461    pub stream_type: String,
462
463    /// The stream id
464    #[serde(rename = "streamId")]
465    pub stream_id: String,
466
467    /// ?
468    #[serde(rename = "fetchNextCallback")]
469    pub fetch_next_callback: String,
470
471    /// Unknown data
472    #[serde(flatten)]
473    pub unknown: HashMap<String, serde_json::Value>,
474}
475
476/// Stream params
477#[derive(Debug, serde::Deserialize)]
478pub struct StreamParams {
479    /// Request params
480    #[serde(rename = "requestParams")]
481    pub request_params: HashMap<String, String>,
482
483    /// ?
484    #[serde(rename = "itemType")]
485    pub item_type: String,
486
487    /// ?
488    #[serde(rename = "requestEndpoint")]
489    pub request_endpoint: String,
490
491    /// ?
492    #[serde(rename = "initialOffset")]
493    pub initial_offset: u64,
494
495    /// Unknown data
496    #[serde(flatten)]
497    pub unknown: HashMap<String, serde_json::Value>,
498}
499
500/// Gallery selection info
501#[derive(Debug, serde::Deserialize)]
502pub struct GallectionSection {
503    /// The current page
504    #[serde(rename = "currentPage")]
505    pub page: u64,
506
507    /// The id of the selected folder
508    #[serde(rename = "selectedFolderId")]
509    pub selected_folder_id: i64,
510
511    /// The total number of pages
512    #[serde(rename = "totalPages")]
513    pub total_pages: u64,
514
515    /// Unknown data
516    #[serde(flatten)]
517    pub unknown: HashMap<String, serde_json::Value>,
518}
519
#[cfg(test)]
mod test {
    use super::*;

    /// Json fixture of the state scraped from a deviation page.
    const SCRAPED_WEBPAGE: &str = include_str!("../../test_data/scraped_webpage.json");
    /// Json fixture of the state scraped from the login page.
    const LOGIN_WEBPAGE: &str = include_str!("../../test_data/login_webpage.json");

    /// The deviation page fixture parses and exposes the expected current deviation id.
    #[test]
    fn parse_scraped_webpage() {
        let info = serde_json::from_str::<ScrapedWebPageInfo>(SCRAPED_WEBPAGE)
            .expect("failed to parse scraped webpage info");
        let current_id = info
            .get_current_deviation_id()
            .expect("missing current deviation id");

        assert_eq!(current_id, 119577071);
    }

    /// The login page fixture parses, even though it has no deviation data.
    #[test]
    fn parse_login_webpage() {
        serde_json::from_str::<ScrapedWebPageInfo>(LOGIN_WEBPAGE)
            .expect("failed to parse scraped webpage info");
    }
}