@@ -77,46 +77,57 @@ def extract_cover_image(self):
def extract_authors(self) -> List[Dict[str, str]]:
    """Collect the authors described by the ``.autor-wrapper`` blocks of ``self.soup``.

    For every block the author name is taken from the block's ``<h2>``; if
    there is none, from the first other heading (``h1``, ``h3``-``h6``); and
    failing that, from the first non-empty text fragment of the block.
    The first word of the name becomes ``first_name`` and the remaining
    words ``last_name``; a single-word name is stored as ``last_name`` only.
    ``bio`` is the block's text with the first occurrence of the name removed.

    Returns:
        A list of dicts with the keys ``"first_name"``, ``"last_name"`` and
        ``"bio"``. Blocks that yield no name text are skipped. An empty
        list is returned (and the error logged) if parsing raises.
    """
    try:
        authors = []

        # "#tab-2 .autor-wrapper" can only match a subset of ".autor-wrapper",
        # so a second lookup with the narrower selector would never add hits.
        author_blocks = self.soup.select(".autor-wrapper")
        logger.info(f"found {len(author_blocks)} author blocks")

        for block in author_blocks:
            name_tag = block.select_one("h2")
            if name_tag is None:
                name_tag = block.find(["h1", "h3", "h4", "h5", "h6"])

            if name_tag is not None:
                # Same separator as the block text below, so a name split
                # across nested inline tags is still found when building bio.
                raw_name = name_tag.get_text(separator=" ", strip=True)
            else:
                # No heading at all: fall back to the first non-empty text
                # fragment of the block.
                raw_name = next(block.stripped_strings, "")

            # Collapse internal whitespace runs (newlines, double spaces).
            full_name = " ".join(raw_name.split())
            if not full_name:
                continue

            # full_name is non-empty here, so there is at least one part.
            parts = full_name.split()
            if len(parts) == 1:
                first_name, last_name = "", parts[0]
            else:
                first_name, last_name = parts[0], " ".join(parts[1:])

            full_text = " ".join(
                block.get_text(separator=" ", strip=True).split()
            )
            # Drop only the first occurrence of the name; re-normalise the
            # whitespace left behind where the name was cut out.
            bio = " ".join(full_text.replace(full_name, "", 1).split())

            authors.append(
                {
                    "first_name": first_name.strip("."),
                    "last_name": last_name,
                    "bio": bio,
                }
            )

        logger.info(f"parsed {len(authors)} authors")
        return authors
    except Exception as e:
        # Best-effort extraction: callers get an empty list on failure.
        # logger.exception logs at ERROR level and appends the traceback,
        # so a separate logger.error call would only duplicate the record.
        logger.exception(f"failed to parse authors: {e}")
        return []
121132
122133 def extract_author_bio (self ) -> str :
0 commit comments