Skip to content

Commit cef925f

Browse files
Fix Claude.ai parsing (#102)
* Update ClaudeExtractor to use 'font-claude-response' class and normalize content extraction
* Add Claude.ai pattern to ExtractorRegistry and fix document reference in ConversationExtractor
1 parent 4f61265 commit cef925f

3 files changed

Lines changed: 8 additions & 4 deletions

File tree

src/extractor-registry.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ export class ExtractorRegistry {
6868

6969
this.register({
7070
patterns: [
71+
'claude.ai',
7172
/^https?:\/\/claude\.ai\/(chat|share)\/.*/
7273
],
7374
extractor: ClaudeExtractor

src/extractors/_conversation.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ export abstract class ConversationExtractor extends BaseExtractor {
1616
const rawContentHtml = this.createContentHtml(messages, footnotes);
1717

1818
// Create a temporary document to run Defuddle on our content
19-
const tempDoc = document.implementation.createHTMLDocument();
19+
const tempDoc = this.document.implementation.createHTMLDocument();
2020
const container = tempDoc.createElement('article');
2121
container.innerHTML = rawContentHtml;
2222
tempDoc.body.appendChild(container);

src/extractors/claude.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ export class ClaudeExtractor extends ConversationExtractor {
77
constructor(document: Document, url: string) {
88
super(document, url);
99
// Find all message blocks - both user and assistant messages
10-
this.articles = document.querySelectorAll('div[data-testid="user-message"], div[data-testid="assistant-message"], div.font-claude-message');
10+
this.articles = document.querySelectorAll('div[data-testid="user-message"], div[data-testid="assistant-message"], div.font-claude-response');
1111
}
1212

1313
canExtract(): boolean {
@@ -33,16 +33,19 @@ export class ClaudeExtractor extends ConversationExtractor {
3333
else {
3434
return;
3535
}
36-
} else if (article.classList.contains('font-claude-message')) {
36+
} else if (article.classList.contains('font-claude-response')) {
3737
// Handle Claude messages
3838
role = 'assistant';
39-
content = article.innerHTML;
39+
const assistantBody = (article.querySelector('.standard-markdown') as HTMLElement) || (article as HTMLElement);
40+
content = assistantBody.innerHTML;
4041
} else {
4142
// Skip unknown elements
4243
return;
4344
}
4445

4546
if (content) {
47+
// Normalize content similar to ChatGPT extractor
48+
content = content.replace(/\u200B/g, '').replace(/<p[^>]*>\s*<\/p>/g, '');
4649
messages.push({
4750
author: role === 'you' ? 'You' : 'Claude',
4851
content: content.trim(),

0 commit comments

Comments
 (0)