deviantart/types/
scraped_webpage_info.rs

1use super::Deviation;
2use super::Media;
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashMap;
6use url::Url;
7
8/// An error that may occur while parsing a [`ScrapedWebPageInfo`] from a html string.
9#[derive(Debug, thiserror::Error)]
10pub enum FromHtmlStrError {
11    /// Missing the InitialState variable
12    #[error("missing initial state")]
13    MissingInitialState,
14
15    /// Failed to parse some state
16    #[error(transparent)]
17    InvalidJson(#[from] serde_json::Error),
18}
19
20/// Info scraped from a deviation url
21#[derive(Debug, serde::Deserialize)]
22pub struct ScrapedWebPageInfo {
23    /// Page config like csrf tokens
24    #[serde(rename = "@@config")]
25    pub config: Config,
26
27    /// Deviations extended deviations maybe?
28    #[serde(rename = "@@entities")]
29    pub entities: Option<Entities>,
30
31    /// ?
32    #[serde(rename = "@@DUPERBROWSE")]
33    pub duper_browse: Option<DuperBrowse>,
34
35    /// Info about the current session
36    #[serde(rename = "@@publicSession")]
37    pub public_session: PublicSession,
38
39    /// Streams
40    #[serde(rename = "@@streams")]
41    pub streams: Option<Streams>,
42
43    /// Needed for login.
44    ///
45    /// Note that this is a different csrf token from the config struct.
46    #[serde(rename = "csrfToken")]
47    pub csrf_token: Option<Box<str>>,
48
49    #[serde(rename = "gallectionSection")]
50    pub gallection_section: Option<GallectionSection>,
51
52    /// Needed for login.
53    #[serde(rename = "luToken")]
54    pub lu_token: Option<Box<str>>,
55
56    /// Needed for login.
57    #[serde(rename = "luToken2")]
58    pub lu_token2: Option<Box<str>>,
59
60    /// Unknown data
61    #[serde(flatten)]
62    pub unknown: HashMap<String, serde_json::Value>,
63}
64
65impl ScrapedWebPageInfo {
66    /// Parse this from a html string
67    pub fn from_html_str(input: &str) -> Result<Self, FromHtmlStrError> {
68        static REGEX: Lazy<Regex> = Lazy::new(|| {
69            Regex::new(r#"window\.__INITIAL_STATE__ = JSON\.parse\("(.*)"\);"#).unwrap()
70        });
71
72        let capture = REGEX
73            .captures(input)
74            .and_then(|captures| captures.get(1))
75            .ok_or(FromHtmlStrError::MissingInitialState)?;
76        // TODO: Escape properly
77        let capture = capture
78            .as_str()
79            .replace("\\\"", "\"")
80            .replace("\\'", "'")
81            .replace("\\\\", "\\");
82        Ok(serde_json::from_str(&capture)?)
83    }
84
85    /// Returns `true` if logged in
86    pub fn is_logged_in(&self) -> bool {
87        self.public_session.is_logged_in
88    }
89
90    /// Get the current deviation's id
91    pub fn get_current_deviation_id(&self) -> Option<&serde_json::Value> {
92        Some(
93            &self
94                .duper_browse
95                .as_ref()?
96                .root_stream
97                .as_ref()?
98                .current_open_item,
99        )
100    }
101
102    /// Get the [`Deviation`] for this page.
103    pub fn get_current_deviation(&self) -> Option<&Deviation> {
104        let id = self.get_current_deviation_id()?;
105        let id = match id {
106            serde_json::Value::Number(n) => n.as_u64()?,
107            serde_json::Value::String(s) => s.parse().ok()?,
108            _ => return None,
109        };
110        self.get_deviation_by_id(id)
111    }
112
113    /// Get the [`DeviationExtended`] for this page.
114    pub fn get_current_deviation_extended(&self) -> Option<&DeviationExtended> {
115        let id = self.get_current_deviation_id()?;
116        let mut key_buffer = itoa::Buffer::new();
117        let key = match id {
118            serde_json::Value::Number(n) => {
119                let n = n.as_u64()?;
120                key_buffer.format(n)
121            }
122            serde_json::Value::String(s) => s,
123            _ => return None,
124        };
125        self.entities
126            .as_ref()?
127            .deviation_extended
128            .as_ref()?
129            .get(key)
130    }
131
132    /// Get a deviation by id, if it exists
133    pub fn get_deviation_by_id(&self, id: u64) -> Option<&Deviation> {
134        let mut key_buffer = itoa::Buffer::new();
135        self.entities.as_ref()?.deviation.get(key_buffer.format(id))
136    }
137
138    /// Take a deviation by id, if it exists
139    pub fn take_deviation_by_id(&mut self, id: u64) -> Option<Deviation> {
140        let mut key_buffer = itoa::Buffer::new();
141        self.entities
142            .as_mut()?
143            .deviation
144            .remove(key_buffer.format(id))
145    }
146
147    /// Get the current folder id, if in a gallery.
148    pub fn get_current_folder_id(&self) -> Option<i64> {
149        Some(self.gallection_section.as_ref()?.selected_folder_id)
150    }
151
152    /// Get a stream for folder post ids, by folder id.
153    ///
154    /// This will return the deviation ids for the current folder.
155    pub fn get_folder_deviations_stream(&self, folder_id: i64) -> Option<&WithOffsetStream> {
156        let key = format!("folder-deviations-gallery-{folder_id}");
157
158        self.streams
159            .as_ref()?
160            .streams
161            .get(&key)?
162            .as_with_offset_stream()
163    }
164
165    /// Get a gallery folder entity by id
166    pub fn get_gallery_folder_entity(&self, folder_id: i64) -> Option<&GalleryFolder> {
167        self.entities
168            .as_ref()?
169            .gallery_folder
170            .as_ref()?
171            .get(itoa::Buffer::new().format(folder_id))
172    }
173
174    /// Get a user entity by id
175    pub fn get_user_entity(&self, user_id: u64) -> Option<&User> {
176        self.entities
177            .as_ref()?
178            .user
179            .as_ref()?
180            .get(itoa::Buffer::new().format(user_id))
181    }
182}
183
184/// ?
185#[derive(Debug, serde::Deserialize)]
186pub struct Config {
187    /// The page's csrf token
188    #[serde(rename = "csrfToken")]
189    pub csrf_token: String,
190
191    /// Unknown data
192    #[serde(flatten)]
193    pub unknown: HashMap<String, serde_json::Value>,
194}
195
196/// Page entities, like deviations, folders, and users.
197#[derive(Debug, serde::Deserialize)]
198pub struct Entities {
199    /// Deviations
200    pub deviation: HashMap<String, Deviation>,
201
202    /// Extended Deviation Info
203    #[serde(rename = "deviationExtended")]
204    pub deviation_extended: Option<HashMap<String, DeviationExtended>>,
205
206    /// Gallery folders
207    #[serde(rename = "galleryFolder")]
208    pub gallery_folder: Option<HashMap<String, GalleryFolder>>,
209
210    /// Users
211    pub user: Option<HashMap<String, User>>,
212
213    /// Unknown data
214    #[serde(flatten)]
215    pub unknown: HashMap<String, serde_json::Value>,
216}
217
218/// Extended Info about a deviation
219#[derive(Debug, serde::Deserialize)]
220pub struct DeviationExtended {
221    /// Download info
222    pub download: Option<Download>,
223
224    /// HTML description
225    pub description: Option<String>,
226
227    /// Other media for this deviation
228    #[serde(rename = "additionalMedia")]
229    pub additional_media: Option<Vec<AdditionalMedia>>,
230
231    /// Whether this image is protected.
232    #[serde(rename = "isDaProtected")]
233    pub is_da_protected: Option<bool>,
234
235    /// Unknown data
236    #[serde(flatten)]
237    pub unknown: HashMap<String, serde_json::Value>,
238}
239
240/// A gallery folder
241#[derive(Debug, serde::Deserialize)]
242pub struct GalleryFolder {
243    /// The folder id.
244    ///
245    /// For some reason, this can be -1 sometimes.
246    #[serde(rename = "folderId")]
247    pub folder_id: i64,
248
249    /// The name of the folder
250    pub name: String,
251
252    /// The user id of the owner of the folder
253    pub owner: u64,
254
255    /// Unknown data
256    #[serde(flatten)]
257    pub unknown: HashMap<String, serde_json::Value>,
258}
259
260/// A user
261#[derive(Debug, serde::Deserialize)]
262pub struct User {
263    /// The user id
264    #[serde(rename = "userId")]
265    pub user_id: u64,
266
267    /// The user name
268    pub username: String,
269
270    /// Unknown data
271    #[serde(flatten)]
272    pub unknown: HashMap<String, serde_json::Value>,
273}
274
275#[derive(Debug, serde::Deserialize)]
276pub struct Download {
277    /// The file size
278    pub filesize: u64,
279
280    /// The image height
281    pub height: u32,
282
283    /// The image width
284    pub width: u32,
285
286    /// ?
287    #[serde(rename = "type")]
288    pub kind: String,
289
290    /// The url
291    pub url: Url,
292
293    /// Unknown data
294    #[serde(flatten)]
295    pub unknown: HashMap<String, serde_json::Value>,
296}
297
298#[derive(Debug, serde::Deserialize)]
299pub struct AdditionalMedia {
300    /// Media info
301    pub media: Media,
302
303    /// Unknown data
304    #[serde(flatten)]
305    pub unknown: HashMap<String, serde_json::Value>,
306}
307
308/// ?
309#[derive(Debug, serde::Deserialize)]
310pub struct DuperBrowse {
311    /// ?
312    #[serde(rename = "rootStream")]
313    pub root_stream: Option<RootStream>,
314
315    /// Unknown data
316    #[serde(flatten)]
317    pub unknown: HashMap<String, serde_json::Value>,
318}
319
320/// ?
321#[derive(Debug, serde::Deserialize)]
322pub struct RootStream {
323    /// The id of the current deviation. This is either a number or string.
324    #[serde(rename = "currentOpenItem")]
325    pub current_open_item: serde_json::Value,
326
327    /// Unknown data
328    #[serde(flatten)]
329    pub unknown: HashMap<String, serde_json::Value>,
330}
331
332/// ?
333#[derive(Debug, serde::Deserialize)]
334pub struct PublicSession {
335    /// Whether the user is logged in
336    #[serde(rename = "isLoggedIn")]
337    pub is_logged_in: bool,
338
339    /// Unknown data
340    #[serde(flatten)]
341    pub unknown: HashMap<String, serde_json::Value>,
342}
343
344/// The streams field
345#[derive(Debug, serde::Deserialize)]
346pub struct Streams {
347    /// Search results appear here
348    #[serde(rename = "@@BROWSE_PAGE_STREAM")]
349    pub browse_page_stream: Option<BrowsePageStream>,
350
351    /// Extra data.
352    ///
353    /// This can include data whos purpose is known, like entries in a folder.
354    #[serde(flatten)]
355    pub streams: HashMap<String, Stream>,
356}
357
358/// ?
359#[derive(Debug, serde::Deserialize)]
360#[serde(tag = "streamType")]
361pub enum Stream {
362    #[serde(rename = "WITH_OFFSET")]
363    WithOffset(WithOffsetStream),
364
365    #[serde(untagged)]
366    Unknown(serde_json::Value),
367}
368
369impl Stream {
370    /// Get this as a WithOffset stream.
371    pub fn as_with_offset_stream(&self) -> Option<&WithOffsetStream> {
372        match self {
373            Self::WithOffset(stream) => Some(stream),
374            _ => None,
375        }
376    }
377}
378
379/// ?
380#[derive(Debug, serde::Deserialize)]
381pub struct WithOffsetStream {
382    /// Items in the stream?
383    pub items: Vec<u64>,
384
385    /// The # of items per fetch?
386    #[serde(rename = "itemsPerFetch")]
387    pub items_per_fetch: u32,
388
389    /// Has more entries?
390    #[serde(rename = "hasMore")]
391    pub has_more: bool,
392
393    /// ?
394    #[serde(rename = "hasLess")]
395    pub has_less: bool,
396
397    /// Unknown data
398    #[serde(flatten)]
399    pub unknown: HashMap<String, serde_json::Value>,
400}
401
402/// Search results appear here
403#[derive(Debug, serde::Deserialize)]
404pub struct BrowsePageStream {
405    /// The cursor
406    pub cursor: String,
407
408    /// Whether this has less?
409    #[serde(rename = "hasLess")]
410    pub has_less: bool,
411
412    /// Whether this has more?
413    #[serde(rename = "hasMore")]
414    pub has_more: bool,
415
416    /// Deviation ids?
417    ///
418    /// Usually, these are integers representing deviation ids.
419    /// In some cases, these are strings of the format "xx-nnnnn",
420    /// where the "xx" part is unknown and the "nnnnn" part is a deviation id.
421    pub items: Vec<serde_json::Value>,
422
423    /// The # of items per page
424    #[serde(rename = "itemsPerFetch")]
425    pub items_per_fetch: u64,
426
427    /// Stream Params
428    #[serde(rename = "streamParams")]
429    pub stream_params: StreamParams,
430
431    /// The stream type
432    #[serde(rename = "streamType")]
433    pub stream_type: String,
434
435    /// The stream id
436    #[serde(rename = "streamId")]
437    pub stream_id: String,
438
439    /// ?
440    #[serde(rename = "fetchNextCallback")]
441    pub fetch_next_callback: String,
442
443    /// Unknown data
444    #[serde(flatten)]
445    pub unknown: HashMap<String, serde_json::Value>,
446}
447
448/// Stream params
449#[derive(Debug, serde::Deserialize)]
450pub struct StreamParams {
451    /// Request params
452    #[serde(rename = "requestParams")]
453    pub request_params: HashMap<String, String>,
454
455    /// ?
456    #[serde(rename = "itemType")]
457    pub item_type: String,
458
459    /// ?
460    #[serde(rename = "requestEndpoint")]
461    pub request_endpoint: String,
462
463    /// ?
464    #[serde(rename = "initialOffset")]
465    pub initial_offset: u64,
466
467    /// Unknown data
468    #[serde(flatten)]
469    pub unknown: HashMap<String, serde_json::Value>,
470}
471
472/// Gallery selection info
473#[derive(Debug, serde::Deserialize)]
474pub struct GallectionSection {
475    /// The current page
476    #[serde(rename = "currentPage")]
477    pub page: u64,
478
479    /// The id of the selected folder
480    #[serde(rename = "selectedFolderId")]
481    pub selected_folder_id: i64,
482
483    /// The total number of pages
484    #[serde(rename = "totalPages")]
485    pub total_pages: u64,
486
487    /// Unknown data
488    #[serde(flatten)]
489    pub unknown: HashMap<String, serde_json::Value>,
490}
491
#[cfg(test)]
mod test {
    use super::*;

    const SCRAPED_WEBPAGE: &str = include_str!("../../test_data/scraped_webpage.json");
    const LOGIN_WEBPAGE: &str = include_str!("../../test_data/login_webpage.json");

    /// The scraped deviation page fixture deserializes and exposes its current deviation id.
    #[test]
    fn parse_scraped_webpage() {
        let info = serde_json::from_str::<ScrapedWebPageInfo>(SCRAPED_WEBPAGE)
            .expect("failed to parse scraped webpage info");
        let current_id = info
            .get_current_deviation_id()
            .expect("missing current deviation id");
        assert_eq!(current_id, 119577071);
    }

    /// The login page fixture deserializes without error.
    #[test]
    fn parse_login_webpage() {
        serde_json::from_str::<ScrapedWebPageInfo>(LOGIN_WEBPAGE)
            .expect("failed to parse scraped webpage info");
    }
}
517}