Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 75 additions & 47 deletions src/__tests__/commands/crawl.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -324,93 +324,118 @@ describe('executeCrawl', () => {
});

describe('Wait mode (synchronous crawl)', () => {
it('should use crawl method with wait when wait flag is set', async () => {
const mockCrawlJob = {
id: '550e8400-e29b-41d4-a716-446655440000',
const jobId = '550e8400-e29b-41d4-a716-446655440000';

beforeEach(() => {
vi.useFakeTimers();
});

afterEach(() => {
vi.useRealTimers();
});

it('should use startCrawl + HTTP polling when wait flag is set (self-hosted compatible)', async () => {
const mockStartResponse = { id: jobId, url: 'https://example.com' };
const mockCompletedStatus = {
id: jobId,
status: 'completed',
total: 100,
completed: 100,
data: [{ markdown: '# Page 1' }],
};
mockClient.crawl.mockResolvedValue(mockCrawlJob);

const result = await executeCrawl({
mockClient.startCrawl.mockResolvedValue(mockStartResponse);
mockClient.getCrawlStatus.mockResolvedValue(mockCompletedStatus);

const crawlPromise = executeCrawl({
urlOrJobId: 'https://example.com',
wait: true,
pollInterval: 0.001,
});

expect(mockClient.crawl).toHaveBeenCalledTimes(1);
expect(mockClient.crawl).toHaveBeenCalledWith(
await vi.advanceTimersByTimeAsync(1);
const result = await crawlPromise;

expect(mockClient.startCrawl).toHaveBeenCalledTimes(1);
expect(mockClient.startCrawl).toHaveBeenCalledWith(
'https://example.com',
expect.objectContaining({
pollInterval: 5000, // Default poll interval
})
expect.objectContaining({ pollInterval: 1 })
);
expect(result).toEqual({
success: true,
data: mockCrawlJob,
});
expect(mockClient.crawl).not.toHaveBeenCalled();
expect(mockClient.getCrawlStatus).toHaveBeenCalledWith(jobId);
expect(result).toEqual({ success: true, data: mockCompletedStatus });
});

it('should include custom pollInterval when provided', async () => {
const mockCrawlJob = {
id: '550e8400-e29b-41d4-a716-446655440000',
it('should use default poll interval of 5000ms when not specified', async () => {
const mockStartResponse = { id: jobId, url: 'https://example.com' };
const mockCompletedStatus = {
id: jobId,
status: 'completed',
total: 100,
completed: 100,
total: 10,
completed: 10,
data: [],
};
mockClient.crawl.mockResolvedValue(mockCrawlJob);

await executeCrawl({
mockClient.startCrawl.mockResolvedValue(mockStartResponse);
mockClient.getCrawlStatus.mockResolvedValue(mockCompletedStatus);

const crawlPromise = executeCrawl({
urlOrJobId: 'https://example.com',
wait: true,
pollInterval: 10,
});

expect(mockClient.crawl).toHaveBeenCalledWith(
await vi.advanceTimersByTimeAsync(5000);
const result = await crawlPromise;

expect(mockClient.startCrawl).toHaveBeenCalledWith(
'https://example.com',
expect.objectContaining({
pollInterval: 10000, // Converted to milliseconds
})
expect.objectContaining({ pollInterval: 5000 })
);
expect(result.success).toBe(true);
});

it('should include timeout when provided', async () => {
const mockCrawlJob = {
id: '550e8400-e29b-41d4-a716-446655440000',
status: 'completed',
it('should return timeout error when crawl exceeds timeout', async () => {
const mockStartResponse = { id: jobId, url: 'https://example.com' };
const mockScrapingStatus = {
id: jobId,
status: 'scraping',
total: 100,
completed: 100,
completed: 10,
data: [],
};
mockClient.crawl.mockResolvedValue(mockCrawlJob);

await executeCrawl({
mockClient.startCrawl.mockResolvedValue(mockStartResponse);
mockClient.getCrawlStatus.mockResolvedValue(mockScrapingStatus);

const crawlPromise = executeCrawl({
urlOrJobId: 'https://example.com',
wait: true,
timeout: 300,
timeout: 1,
pollInterval: 0.001,
});

expect(mockClient.crawl).toHaveBeenCalledWith(
'https://example.com',
expect.objectContaining({
timeout: 300000, // Converted to milliseconds
})
);
// Advance past the timeout
await vi.advanceTimersByTimeAsync(2000);
const result = await crawlPromise;

expect(result.success).toBe(false);
expect(result.error).toMatch(/Timeout after 1 seconds/);
});

it('should combine wait options with crawl options', async () => {
const mockCrawlJob = {
id: '550e8400-e29b-41d4-a716-446655440000',
const mockStartResponse = { id: jobId, url: 'https://example.com' };
const mockCompletedStatus = {
id: jobId,
status: 'completed',
total: 50,
completed: 50,
data: [],
};
mockClient.crawl.mockResolvedValue(mockCrawlJob);

await executeCrawl({
mockClient.startCrawl.mockResolvedValue(mockStartResponse);
mockClient.getCrawlStatus.mockResolvedValue(mockCompletedStatus);

const crawlPromise = executeCrawl({
urlOrJobId: 'https://example.com',
wait: true,
pollInterval: 5,
Expand All @@ -419,7 +444,10 @@ describe('executeCrawl', () => {
maxDepth: 2,
});

expect(mockClient.crawl).toHaveBeenCalledWith(
await vi.advanceTimersByTimeAsync(5000);
await crawlPromise;

expect(mockClient.startCrawl).toHaveBeenCalledWith(
'https://example.com',
expect.objectContaining({
pollInterval: 5000,
Expand Down Expand Up @@ -526,9 +554,9 @@ describe('executeCrawl', () => {
});
});

it('should return error result when crawl fails', async () => {
it('should return error result when startCrawl fails in wait mode', async () => {
const errorMessage = 'Crawl timeout';
mockClient.crawl.mockRejectedValue(new Error(errorMessage));
mockClient.startCrawl.mockRejectedValue(new Error(errorMessage));

const result = await executeCrawl({
urlOrJobId: 'https://example.com',
Expand Down
38 changes: 32 additions & 6 deletions src/commands/crawl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,38 @@ export async function executeCrawl(
}
}
} else {
// Use SDK's built-in polling (no progress display)
const crawlJob = await app.crawl(urlOrJobId, crawlOptions);
return {
success: true,
data: crawlJob,
};
// Use custom HTTP polling (compatible with self-hosted instances)
// Flow: start the crawl job, then repeatedly sleep and fetch its status until
// the job reaches a terminal status or the optional timeout has elapsed.
const response = await app.startCrawl(urlOrJobId, crawlOptions);
const jobId = response.id;

// Delay between status requests, passed straight to setTimeout (milliseconds).
// Because `||` is used, a missing pollInterval and a pollInterval of 0 both
// fall back to 5000.
// NOTE(review): presumably crawlOptions.pollInterval has already been converted
// from seconds to milliseconds where crawlOptions is built — confirm.
const pollMs = crawlOptions.pollInterval || 5000;
const startTime = Date.now();
// `timeout` is treated as seconds (scaled by 1000 here and reported as seconds
// in the error message below). A falsy timeout means no deadline is enforced.
const timeoutMs = timeout ? timeout * 1000 : undefined;

// The loop has no exit condition of its own: it ends through one of the two
// `return` statements below, or when one of the awaited calls rejects (the
// rejection propagates out of this block; its handling is outside this view).
while (true) {
// Sleep before every status request, including the first one.
await new Promise((resolve) => setTimeout(resolve, pollMs));

const status = await app.getCrawlStatus(jobId);

// Any terminal status ends the wait and the status object is returned as data.
// NOTE(review): 'failed' and 'cancelled' are also returned with success: true,
// so callers have to inspect data.status to detect them — confirm this is intended.
if (
status.status === 'completed' ||
status.status === 'failed' ||
status.status === 'cancelled'
) {
return {
success: true,
data: status,
};
}

// Check timeout
// The deadline is only evaluated here, after a full sleep and a status request,
// and only when the job is not yet terminal. The real wait can therefore exceed
// `timeout` by up to one poll interval plus the duration of the request.
if (timeoutMs && Date.now() - startTime > timeoutMs) {
return {
success: false,
error: `Timeout after ${timeout} seconds. Crawl still in progress.`,
};
}
}
}
}

Expand Down