22import json
33import uuid
44from dataclasses import dataclass , field
5- from typing import Dict , List , Optional , Union
5+ from typing import Dict , List , Optional , Union , Literal
66from urllib .parse import urljoin , urlparse
7+ from datetime import datetime
78
89from .form import build_form
910from .options import AnyparserOption
@@ -36,22 +37,22 @@ class AnyparserResultBase:
class AnyparserCrawlDirectiveBase:
    """Base record for an Anyparser crawl directive.

    Holds the directive type, priority, name, the noindex / nofollow flags,
    the crawl delay and the ``unavailable_after`` timestamp. Every field has
    a default, so the record can be created without arguments.
    """

    # The default must be one of the literal values listed in the annotation;
    # it is exactly "Combined" (no surrounding whitespace), matching the
    # default used by AnyparserCrawlDirective.
    type: Literal["HTTP Header", "HTML Meta", "Combined"] = field(default="Combined")
    priority: int = field(default=0)
    name: Optional[str] = field(default=None)
    noindex: Optional[bool] = field(default=False)
    nofollow: Optional[bool] = field(default=False)
    crawl_delay: Optional[int] = field(default=None)
    unavailable_after: Optional[datetime] = field(default=None)
4547
4648
@dataclass
class AnyparserCrawlDirective(AnyparserCrawlDirectiveBase):
    """Crawl directive of type 'Combined'.

    Narrows ``type`` to the single literal "Combined", keeps ``name`` fixed
    at None, and adds ``underlying``: a list of base directives that
    defaults to a new empty list for each instance.
    """

    # Overrides of inherited fields keep their position from the base class.
    type: Literal["Combined"] = field(default="Combined")
    name: Optional[None] = field(default=None)
    # New field; placed after all inherited fields in the generated __init__.
    underlying: List[AnyparserCrawlDirectiveBase] = field(default_factory=list)
5556
5657
5758@dataclass
@@ -74,11 +75,11 @@ class AnyparserUrl:
7475 politeness_delay : int = field (default = 0 )
7576 total_characters : int = field (default = 0 )
7677 markdown : str = field (default = "" )
77-
7878 directive : AnyparserCrawlDirective = field (default_factory = AnyparserCrawlDirective )
7979 title : Optional [str ] = field (default = None )
8080 crawled_at : Optional [str ] = field (default = None )
81-
81+ images : List [AnyparserImageReference ] = field (default_factory = list )
82+ text : Optional [str ] = field (default = None )
8283
8384@dataclass
8485class AnyparserPdfPage :
@@ -111,11 +112,11 @@ class AnyparserCrawlResult:
111112 robots_directive : AnyparserRobotsTxtDirective
112113
113114
# Alias covering every result type: PDF result, crawl result, or base result.
AnyparserResult = Union[
    AnyparserPdfResult,
    AnyparserCrawlResult,
    AnyparserResultBase,
]
115116
116117
117118class Anyparser :
118- """Main class for parsing itemss using the AnyParser API."""
119+ """Main class for parsing items using the Anyparser API."""
119120
120121 def __init__ (self , options : Optional [AnyparserOption ] = None ) -> None :
121122 """Initialize the parser with optional configuration.
@@ -128,7 +129,7 @@ def __init__(self, options: Optional[AnyparserOption] = None) -> None:
128129 async def parse (
129130 self , file_paths_or_url : Union [str , List [str ]]
130131 ) -> Union [List [AnyparserResult ], str ]:
131- """Parse files using the AnyParser API.
132+ """Parse files using the Anyparser API.
132133
133134 Args:
134135 file_paths_or_url: A single file path or list of file paths to parse, or a start URL for crawling
0 commit comments