-
-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathCargo.toml
More file actions
161 lines (134 loc) · 5.34 KB
/
Copy pathCargo.toml
File metadata and controls
161 lines (134 loc) · 5.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Root manifest of the `crw` workspace: lists the member crates and pins the
# dependency resolver for all of them.
[workspace]
# Resolver version is set explicitly at the workspace level.
resolver = "2"
members = [
    "crates/crw-core",
    "crates/crw-diff",
    "crates/crw-renderer",
    "crates/crw-extract",
    "crates/crw-crawl",
    "crates/crw-search",
    "crates/crw-server",
    "crates/crw-mcp",
    "crates/crw-mcp-proto",
    "crates/crw-browse",
    "crates/crw-cli",
    "crates/crw-monitor",
]

# Package metadata declared once for the workspace so member crates can inherit
# it instead of repeating it.
[workspace.package]
# Same number as the internal-crate version pins under [workspace.dependencies].
version = "0.16.0"
edition = "2024"
# NOTE(review): SPDX lists the bare `AGPL-3.0` identifier as deprecated in favor
# of `AGPL-3.0-only` / `AGPL-3.0-or-later` — confirm which one is intended.
license = "AGPL-3.0"
repository = "https://github.com/us/crw"
homepage = "https://us.github.io/crw"
keywords = ["web-scraper", "web-crawler", "firecrawl", "mcp", "llm"]
categories = ["web-programming", "command-line-utilities"]
# Dependency versions and features declared once for the whole workspace.
[workspace.dependencies]
# Async runtime
tokio = { version = "1", features = ["full"] }

# Web framework
axum = { version = "0.8", features = ["macros", "multipart"] }
tower = "0.5"
tower-http = { version = "0.6", features = ["cors", "trace", "timeout", "set-header"] }

# HTTP client
reqwest = { version = "0.13", default-features = false, features = ["json", "cookies", "rustls", "gzip", "brotli", "deflate", "socks", "stream", "multipart"] }

# Serialization
serde = { version = "1", features = ["derive"] }
serde_json = "1"
toml = "0.8"

# HTML processing
lol_html = "2"
scraper = "0.25"
htmd = "0.5"
ego-tree = "0.10"
xxhash-rust = { version = "0.8", features = ["xxh3"] }
once_cell = "1"

# Text processing
regex = "1"

# PDF parsing: PDF -> markdown conversion in pure Rust (lopdf-based). Only
# `crw-extract` depends on it, optionally, behind that crate's default-on `pdf`
# feature. It needs no system libraries, which preserves the
# single-static-binary story. The open-core boundary gate asserts that
# lopdf/ttf-parser are ABSENT when `pdf` is disabled.
pdf-inspector = "0.1"

# Diffing (change-tracking / monitor): Myers diff over lines. Both the
# parse-diff AST and the unified text surface are derived from its op stream.
similar = "2"

# Randomness
rand = "0.9"

# Async utilities
futures = "0.3"
dashmap = "6"

# WebSocket client (shared by crw-renderer and crw-browse)
tokio-tungstenite = { version = "0.28", features = ["rustls-tls-native-roots"] }

# JSON schema generation (used by rmcp #[tool] macro)
schemars = "1.0"

# Config
config = "0.15"

# Error handling
thiserror = "2"

# Logging
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }

# CLI
clap = { version = "4", features = ["derive", "env"] }

# Metrics
prometheus = "0.13"

# Caching (host preferences)
moka = { version = "0.12", features = ["future"] }
publicsuffix = "2"

# Misc
uuid = { version = "1", features = ["v4", "serde"] }
url = { version = "2", features = ["serde"] }
sha2 = "0.10"
hex = "0.4"
base64 = "0.22"

# Self-host monitor mode (feature-gated, default OFF). Optional deps of
# `crw-monitor` only; they MUST NOT leak into the default `crw-server` build —
# the open-core boundary gate (`cargo tree -p crw-server`) asserts their absence.
# NOTE: no external cron crate is pulled in. The monitor scheduler is a
# self-contained tokio sleep-loop (UTC cron + fixed-interval parser in
# `crw-monitor::schedule`), which is simpler and keeps the dependency surface
# (and the open-core tree) minimal. `tokio-cron-scheduler` was evaluated and
# intentionally not adopted.
rusqlite = { version = "0.32", features = ["bundled"] }
hmac = "0.12"

# Unix process-group kill (browser teardown), used by crw-renderer's
# BROWSER_PGIDS group-kill registry. Unix-only; already present transitively.
libc = "0.2"

# Compile-time perfect-hash sets (url_filter deny-lists)
phf = { version = "0.11", features = ["macros"] }

# Test dependencies
tokio-test = "0.4"
wiremock = "0.6"
axum-test = "19"
proptest = "1"
insta = { version = "1", features = ["json"] }

# Internal crates. Their crate-to-crate version pins are centralized here so
# they live in ONE place: release-please bumps them via
# release-please-config.json ($.workspace.dependencies.<crate>.version) in
# lockstep with [workspace.package].version. Member crates inherit with
# `{ workspace = true }` and layer their own `features` / `optional` on top.
# `path` is resolved relative to THIS (root) manifest. Only crates that another
# crate depends on are listed; leaf binaries (crw-mcp, crw-cli) just consume.
crw-mcp-proto = { path = "crates/crw-mcp-proto", version = "0.16.0" }
crw-core = { path = "crates/crw-core", version = "0.16.0" }
crw-extract = { path = "crates/crw-extract", version = "0.16.0" }
crw-renderer = { path = "crates/crw-renderer", version = "0.16.0" }
crw-search = { path = "crates/crw-search", version = "0.16.0" }
crw-diff = { path = "crates/crw-diff", version = "0.16.0" }
crw-crawl = { path = "crates/crw-crawl", version = "0.16.0" }
crw-monitor = { path = "crates/crw-monitor", version = "0.16.0" }
crw-server = { path = "crates/crw-server", version = "0.16.0" }
crw-browse = { path = "crates/crw-browse", version = "0.16.0" }

# Standard release build: link-time optimization, a single codegen unit, and
# stripped output. Also the base that [profile.release-small] inherits from.
[profile.release]
lto = true
codegen-units = 1
strip = true
# Size-optimized profile for the lean proxy MCP binary. Inherits release (lto +
# codegen-units=1 + strip) and adds `opt-level = "z"` to minimize size. It is a
# separate profile so that it does NOT slow the CPU-bound scraping engine in the
# normal release build.
# NO `panic = "abort"` here: crw-extract relies on `catch_unwind` to survive
# malformed-PDF parser panics, which abort would turn into process crashes.
# Build the lean binary with:
#   cargo build --profile release-small --no-default-features -p crw-mcp
[profile.release-small]
inherits = "release"
opt-level = "z"