File size: 12,294 Bytes
d2e7d62
 
 
 
 
04b34c7
d2e7d62
04b34c7
 
 
8e5c2f5
 
 
d2e7d62
 
04b34c7
 
ae1a622
04b34c7
 
2c52ca5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04b34c7
 
 
 
 
 
 
 
 
 
ae1a622
04b34c7
 
 
 
8e5c2f5
87a673a
8e5c2f5
87a673a
 
 
 
 
 
 
8f27e70
 
87a673a
 
 
 
 
 
8e5c2f5
87a673a
8e5c2f5
 
 
d2e7d62
04b34c7
 
 
 
 
 
8e5c2f5
 
04b34c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbfb380
 
 
 
 
 
8e5c2f5
 
fbfb380
 
 
8e5c2f5
 
fbfb380
8e5c2f5
 
 
 
fbfb380
 
 
8e5c2f5
fbfb380
8e5c2f5
 
904380f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e5c2f5
 
 
 
 
904380f
 
9ef0cf3
8e5c2f5
 
 
 
 
 
 
 
 
 
87a673a
8e5c2f5
 
 
 
 
 
 
 
87a673a
 
8e5c2f5
87a673a
8e5c2f5
 
 
87a673a
8e5c2f5
 
 
04b34c7
 
 
 
 
 
 
ae1a622
 
04b34c7
 
 
 
 
ae1a622
04b34c7
904380f
1d2af92
 
04b34c7
 
 
 
 
 
ae1a622
04b34c7
 
 
2c52ca5
8e5c2f5
2c52ca5
 
 
8e5c2f5
 
 
 
 
 
 
 
 
 
 
 
 
 
87a673a
 
 
 
8f27e70
87a673a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5257596
87a673a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f27e70
87a673a
 
 
8f27e70
87a673a
 
349dbe4
87a673a
 
 
 
 
 
 
 
 
 
 
 
349dbe4
87a673a
 
 
 
 
349dbe4
87a673a
 
349dbe4
87a673a
 
 
 
349dbe4
87a673a
 
 
349dbe4
87a673a
349dbe4
87a673a
 
 
 
349dbe4
87a673a
 
 
349dbe4
 
87a673a
 
 
8f27e70
 
87a673a
2c52ca5
 
04b34c7
 
 
 
 
 
d2e7d62
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
<!doctype html>
<html>
	<head>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width" />
		<title>Parquet Visualization Studio</title>
		<link rel="stylesheet" href="style.css" />
		<!--
			DuckDB-WASM is intentionally not loaded here: the inline module script in
			<body> imports it as an ES module and starts its worker from the jsDelivr
			bundle. Loading the worker bundle as a page script would run worker code
			in the main document.
		-->
		<!-- Vega stack: exposes the global vegaEmbed() used by the inline module script. -->
		<script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
		<script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
		<script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
	</head>
	<body>
		<div class="container">
			<h1>📊 Parquet Visualization Studio</h1>
			<p class="subtitle">Visualize parquet files with interactive charts</p>

			<!-- Dataset form: pick an example dataset or type a parquet URL, then submit to load it. -->
			<form id="queryForm">
				<div class="form-group">
					<label for="urlSelect">Select Example Dataset</label>
					<!-- Choosing a non-empty option copies its value into #parquetUrl and loads it (change listener in the inline script). -->
					<select id="urlSelect">
						<option value="">-- Choose a dataset or enter custom URL below --</option>
						<option value="https://huggingface.co/datasets/PleIAs/SYNTH/resolve/refs%2Fconvert%2Fparquet/default/partial-train/0000.parquet">PleIAs/SYNTH</option>
						<option value="https://huggingface.co/datasets/facebook/omnilingual-asr-corpus/resolve/refs%2Fconvert%2Fparquet/gby_Latn/train/0000.parquet">facebook/omnilingual-asr-corpus</option>
						<!-- NOTE(review): the example.com entries below look like placeholders, not real datasets; confirm or replace them. -->
						<option value="https://example.com/dataset3.parquet">Dataset 3</option>
						<option value="https://example.com/dataset4.parquet">Dataset 4</option>
						<option value="https://example.com/dataset5.parquet">Dataset 5</option>
						<option value="https://example.com/dataset6.parquet">Dataset 6</option>
						<option value="https://example.com/dataset7.parquet">Dataset 7</option>
						<option value="https://example.com/dataset8.parquet">Dataset 8</option>
						<option value="https://example.com/dataset9.parquet">Dataset 9</option>
						<option value="https://example.com/dataset10.parquet">Dataset 10</option>
					</select>
				</div>

				<div class="form-group">
					<label for="parquetUrl">Parquet File URL</label>
					<!-- Free-form URL field; the inline script also reloads the dataset when this field loses focus with a new value. -->
					<input
						type="text"
						id="parquetUrl"
						placeholder="https://example.com/data.parquet"
						required
					/>
				</div>

				<!-- The inline script disables this button and relabels it while a dataset is loading. -->
				<button type="submit" id="submitBtn">Load Dataset</button>
			</form>

			<div id="status" class="status"></div>

			<!-- Hidden until a dataset with at least one column has been loaded by the inline script. -->
			<div id="visualizationSection" class="visualization-section" style="display: none;">
				<h2>Create Visualization</h2>
				<div class="form-group">
					<label for="hfToken">Hugging Face Token (required for LLM)</label>
					<input
						type="password"
						id="hfToken"
						placeholder="Enter your HF token with Inference Providers permission"
					/>
					<!-- rel="noopener noreferrer" keeps the newly opened tab from getting a handle on this page. -->
					<small>Get a token from <a href="https://huggingface.co/settings/tokens" target="_blank" rel="noopener noreferrer">HF Settings</a> with "Make calls to Inference Providers" permission</small>
				</div>
				<div class="form-group">
					<label for="vizPrompt">Describe the visualization you want</label>
					<textarea
						id="vizPrompt"
						rows="3"
						placeholder="e.g., Show a scatter plot of price vs quantity, Create a bar chart showing count by category..."
					></textarea>
				</div>
				<button type="button" id="generateVizBtn">Generate Visualization</button>
				<!-- Render target passed to vegaEmbed() as '#vizContainer'. -->
				<div id="vizContainer" class="viz-container"></div>
			</div>

		</div>

		<script type="module">
			import * as duckdb from 'https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@latest/+esm';

			// Module-level state shared by the functions and event handlers below.
			// AsyncDuckDB instance; stays null until initDuckDB() assigns it.
			let db = null;
			// Connection obtained from db.connect(); used for every SQL query.
			let conn = null;
			// URL most recently handed to detectColumns() by one of the handlers.
			let currentDatasetUrl = null;
			// Detected schema as [{ name, type }], filled in by detectColumns().
			let columnInfo = [];

			/**
			 * Start DuckDB-WASM in a web worker and open a connection.
			 *
			 * The module-level `db` and `conn` are only assigned once both the
			 * instantiation and the connection have succeeded. If any step fails,
			 * `db` stays unset, so a later `loadDataset()` call retries the
			 * initialization instead of running against a half-initialized database.
			 *
			 * @returns {Promise<void>}
			 * @throws Re-throws whatever bundle selection, instantiation or connect raised.
			 */
			async function initDuckDB() {
				const bundles = duckdb.getJsDelivrBundles();
				const bundle = await duckdb.selectBundle(bundles);

				// Wrap the CDN worker in a same-origin blob script so it can be
				// started with `new Worker()` from this page.
				const workerUrl = URL.createObjectURL(
					new Blob([`importScripts("${bundle.mainWorker}");`], { type: 'text/javascript' })
				);

				let worker = null;
				try {
					worker = new Worker(workerUrl);
					const database = new duckdb.AsyncDuckDB(new duckdb.ConsoleLogger(), worker);
					await database.instantiate(bundle.mainModule, bundle.pthreadWorker);
					const connection = await database.connect();

					// Publish only after everything succeeded.
					db = database;
					conn = connection;
				} catch (error) {
					// Do not leave an orphaned worker behind when startup fails.
					worker?.terminate();
					throw error;
				} finally {
					// The blob URL is only needed while the worker boots.
					URL.revokeObjectURL(workerUrl);
				}
			}

			/**
			 * Show a message in the #status banner.
			 *
			 * @param {string} message - Text to display.
			 * @param {string} [type='info'] - Suffix appended to the `status-` CSS class.
			 */
			function setStatus(message, type = 'info') {
				const banner = document.getElementById('status');
				Object.assign(banner, {
					textContent: message,
					className: `status status-${type}`,
				});
				banner.style.display = 'block';
			}

			/**
			 * Tell whether a DuckDB type name denotes a nested type.
			 *
			 * Recognizes the keyword forms (STRUCT, LIST, MAP, UNION, ARRAY) as well
			 * as the bracket notation DuckDB prints for lists and fixed-size arrays,
			 * e.g. `INTEGER[]` or `VARCHAR[3]`.
			 *
			 * @param {string} type - Type name as reported by DuckDB.
			 * @returns {boolean} True for nested types, false for scalar ones.
			 */
			function isComplexType(type) {
				const normalized = type.trim().toUpperCase();

				// `T[]` (list) and `T[n]` (array) start with the element type, so a
				// prefix check alone would classify them as scalar.
				if (normalized.endsWith(']')) return true;

				const complexPrefixes = ['STRUCT', 'LIST', 'MAP', 'UNION', 'ARRAY'];
				return complexPrefixes.some((prefix) => normalized.startsWith(prefix));
			}

			/**
			 * Tell whether a DuckDB type name denotes a scalar numeric type.
			 *
			 * Covers signed and unsigned integers, floating point and fixed-point
			 * types. Parameterized names such as `DECIMAL(18,3)` match by prefix.
			 *
			 * @param {string} type - Type name as reported by DuckDB.
			 * @returns {boolean} True for numeric scalars; false for nested types and everything else.
			 */
			function isNumericType(type) {
				// Nested types are never numeric, even when their name mentions one.
				if (isComplexType(type)) return false;

				const numericTypes = [
					'TINYINT', 'SMALLINT', 'INTEGER', 'BIGINT', 'HUGEINT',
					// Unsigned variants start with "U", so the signed names above do not match them.
					'UTINYINT', 'USMALLINT', 'UINTEGER', 'UBIGINT', 'UHUGEINT',
					'FLOAT', 'DOUBLE', 'DECIMAL', 'NUMERIC', 'REAL',
				];
				const normalized = type.toUpperCase();
				return numericTypes.some((name) => normalized.startsWith(name));
			}

			/**
			 * Tell whether a DuckDB type name denotes a scalar text type.
			 *
			 * @param {string} type - Type name as reported by DuckDB.
			 * @returns {boolean} True when the name starts with VARCHAR, CHAR, TEXT or STRING
			 *   (case-insensitive) and is not a nested type.
			 */
			function isTextType(type) {
				// Nested types are rejected before looking at the name prefix.
				if (isComplexType(type)) {
					return false;
				}

				const upper = type.toUpperCase();
				for (const prefix of ['VARCHAR', 'CHAR', 'TEXT', 'STRING']) {
					if (upper.startsWith(prefix)) {
						return true;
					}
				}
				return false;
			}

			/**
			 * Make the parquet file at `url` available to DuckDB as 'data.parquet'.
			 *
			 * Initializes DuckDB on first use, drops any previous 'data.parquet'
			 * registration, then registers the new URL under that name.
			 *
			 * @param {string} url - Location of the parquet file.
			 * @returns {Promise<void>}
			 * @throws Propagates errors from initDuckDB() and db.registerFileURL().
			 */
			async function loadDataset(url) {
				// Initialize DuckDB if not already done
				if (!db) {
					await initDuckDB();
				}

				// Dropping is best-effort: a failure here must not block loading the
				// new file, but it is logged so it can still be diagnosed.
				try {
					await db.dropFile('data.parquet');
				} catch (error) {
					console.debug('Ignoring dropFile failure for data.parquet:', error);
				}

				// Register the parquet file from URL
				await db.registerFileURL(
					'data.parquet',
					url,
					duckdb.DuckDBDataProtocol.HTTP,
					false
				);
			}

			/**
			 * Load the parquet file at `url` and read its schema into `columnInfo`.
			 *
			 * On success the visualization section is shown. On failure `columnInfo`
			 * is cleared, the visualization section is hidden so a schema from a
			 * previously loaded dataset cannot be used, and the error is reported in
			 * the status banner instead of being re-thrown.
			 *
			 * @param {string} url - Location of the parquet file to inspect.
			 * @returns {Promise<void>}
			 */
			async function detectColumns(url) {
				try {
					setStatus('Detecting column types...', 'info');

					await loadDataset(url);

					// DESCRIBE yields one row per column with its name and type.
					const result = await conn.query("DESCRIBE 'data.parquet'");
					columnInfo = result.toArray().map((row) => ({
						name: row.column_name,
						type: row.column_type
					}));

					setStatus(`Detected ${columnInfo.length} columns`, 'success');
					showVisualizationSection();

				} catch (error) {
					console.error('Error detecting columns:', error);
					columnInfo = [];
					// With no columns the helper hides the section and writes a generic
					// status; the specific error is set afterwards so it is what the
					// user ends up seeing.
					showVisualizationSection();
					setStatus(`Error detecting columns: ${error.message}`, 'error');
				}
			}

			/**
			 * Toggle the visualization section according to the detected schema.
			 *
			 * Shows the section when `columnInfo` has at least one entry; otherwise
			 * hides it and reports that no columns were found.
			 */
			function showVisualizationSection() {
				const section = document.getElementById('visualizationSection');
				const hasColumns = columnInfo.length > 0;

				if (!hasColumns) {
					section.style.display = 'none';
					setStatus('No columns found in dataset', 'error');
					return;
				}

				section.style.display = 'block';
			}

			/**
			 * Submit handler for the dataset form.
			 *
			 * Reads the URL field, and if it is non-empty loads that dataset while
			 * the submit button is disabled and labelled "Loading...". The button is
			 * always restored afterwards.
			 *
			 * @param {Event} e - The submit event; its default action is cancelled.
			 * @returns {Promise<void>}
			 */
			async function handleSubmit(e) {
				e.preventDefault();

				const url = document.getElementById('parquetUrl').value.trim();
				const button = document.getElementById('submitBtn');

				if (url.length === 0) {
					setStatus('Please provide a parquet URL.', 'error');
					return;
				}

				// Single place that flips the button between its busy and idle look.
				const setBusy = (busy) => {
					button.disabled = busy;
					button.textContent = busy ? 'Loading...' : 'Load Dataset';
				};

				try {
					setBusy(true);

					// Remember the URL before loading it.
					currentDatasetUrl = url;
					await detectColumns(url);
				} catch (error) {
					console.error('Error:', error);
					setStatus(`Error: ${error.message}`, 'error');
				} finally {
					setBusy(false);
				}
			}

			// When an example dataset is picked, copy its URL into the text field
			// and load it; the empty placeholder option is ignored.
			document.getElementById('urlSelect').addEventListener('change', async (event) => {
				const { value: chosenUrl } = event.target;
				if (!chosenUrl) {
					return;
				}

				document.getElementById('parquetUrl').value = chosenUrl;
				currentDatasetUrl = chosenUrl;
				await detectColumns(chosenUrl);
			});

			// When the URL field loses focus, load the typed URL if it is non-empty
			// and differs from the dataset that is currently selected.
			document.getElementById('parquetUrl').addEventListener('blur', async (event) => {
				const typedUrl = event.target.value.trim();
				const isNewUrl = typedUrl !== '' && typedUrl !== currentDatasetUrl;
				if (!isNewUrl) {
					return;
				}

				currentDatasetUrl = typedUrl;
				await detectColumns(typedUrl);
			});

			/**
			 * Extract the Vega-Lite spec object from raw LLM output.
			 *
			 * @param {string} rawContent - Message content returned by the model.
			 * @returns {object} The parsed specification.
			 * @throws {Error} When no JSON object can be parsed from the content.
			 */
			function parseVegaLiteSpec(rawContent) {
				// Reasoning models may prepend a <think>…</think> block; drop it so
				// braces inside the reasoning are not mistaken for the spec.
				let jsonStr = rawContent.replace(/<think>[\s\S]*?<\/think>/gi, '').trim();

				// Keep only the outermost {...}; this also discards markdown code
				// fences and any prose around the JSON.
				const start = jsonStr.indexOf('{');
				const end = jsonStr.lastIndexOf('}');
				if (start !== -1 && end > start) {
					jsonStr = jsonStr.slice(start, end + 1);
				}

				let spec;
				try {
					spec = JSON.parse(jsonStr);
				} catch (e) {
					throw new Error(`Failed to parse LLM response as JSON: ${e.message}`);
				}
				if (spec === null || typeof spec !== 'object' || Array.isArray(spec)) {
					throw new Error('Failed to parse LLM response as JSON: expected a JSON object');
				}
				return spec;
			}

			/**
			 * Convert one query result row into a plain object for Vega.
			 *
			 * @param {object} row - Row from `result.toArray()`; rows exposing `toJSON()` are unwrapped first.
			 * @returns {object} Plain object with the same keys.
			 */
			function toPlainRow(row) {
				const source = typeof row?.toJSON === 'function' ? row.toJSON() : row;
				return Object.fromEntries(
					Object.entries(source).map(([key, value]) => [
						key,
						// BigInt cannot be mixed with Number in arithmetic; convert it,
						// accepting precision loss beyond Number.MAX_SAFE_INTEGER.
						typeof value === 'bigint' ? Number(value) : value,
					])
				);
			}

			/**
			 * Ask the LLM for a Vega-Lite spec matching `prompt`, fill it with up to
			 * 1000 rows of the loaded dataset and render it into #vizContainer.
			 *
			 * Errors are logged and shown in the status banner; they are not re-thrown.
			 *
			 * @param {string} prompt - User description of the desired chart.
			 * @param {string} hfToken - Hugging Face token sent as a Bearer credential.
			 * @returns {Promise<void>}
			 */
			async function generateVisualization(prompt, hfToken) {
				const vizContainer = document.getElementById('vizContainer');
				vizContainer.innerHTML = '';

				try {
					setStatus('Generating visualization with LLM...', 'info');

					// Prepare column information for the LLM
					const columnDescriptions = columnInfo.map(col => `- ${col.name}: ${col.type}`).join('\n');

					// Create system prompt
					const systemPrompt = `You are a data visualization assistant that generates Vega-Lite specifications.

Available dataset columns:
${columnDescriptions}

Instructions:
1. Generate a valid Vega-Lite v5 specification based on the user's request
2. Use ONLY columns that exist in the dataset above
3. The data will be provided as an array of objects in the "data.values" field
4. Output ONLY the JSON specification, no explanations or markdown
5. Do not include the data itself, just reference fields by name
6. Include appropriate width and height (e.g., 600x400)
7. Make sure the spec is complete and valid

Output only the JSON spec starting with { and ending with }.`;

					// Call HF Inference API
					const response = await fetch(
						"https://router.huggingface.co/v1/chat/completions",
						{
							method: "POST",
							headers: {
								Authorization: `Bearer ${hfToken}`,
								"Content-Type": "application/json",
							},
							body: JSON.stringify({
								model: "deepseek-ai/DeepSeek-R1",
								messages: [
									{
										role: "system",
										content: systemPrompt
									},
									{
										role: "user",
										content: prompt
									}
								],
								temperature: 0.7,
								max_tokens: 2000
							}),
						}
					);

					if (!response.ok) {
						throw new Error(`API request failed: ${response.status} ${response.statusText}`);
					}

					const data = await response.json();
					const vegaSpec = data?.choices?.[0]?.message?.content;
					if (typeof vegaSpec !== 'string' || vegaSpec.trim() === '') {
						throw new Error('LLM response did not contain any message content');
					}

					// Parse and validate the Vega-Lite spec
					const spec = parseVegaLiteSpec(vegaSpec);

					// Fetch data for the visualization
					setStatus('Fetching data for visualization...', 'info');
					const query = `SELECT * FROM 'data.parquet' LIMIT 1000`;
					const result = await conn.query(query);
					const dataArray = result.toArray().map(toPlainRow);

					// Inject data into the spec, replacing whatever data the model put at the top level
					spec.data = { values: dataArray };

					// Render the visualization
					setStatus('Rendering visualization...', 'info');
					await vegaEmbed('#vizContainer', spec);
					setStatus('Visualization generated successfully!', 'success');

				} catch (error) {
					console.error('Error generating visualization:', error);
					setStatus(`Error: ${error.message}`, 'error');
				}
			}

			// Validate the prompt and token fields, then request a visualization.
			document.getElementById('generateVizBtn').addEventListener('click', async () => {
				const readTrimmed = (id) => document.getElementById(id).value.trim();
				const prompt = readTrimmed('vizPrompt');
				const hfToken = readTrimmed('hfToken');

				// The prompt is checked first, so its message wins when both are empty.
				if (prompt.length === 0) {
					setStatus('Please enter a visualization prompt', 'error');
					return;
				}
				if (hfToken.length === 0) {
					setStatus('Please enter your Hugging Face token', 'error');
					return;
				}

				await generateVisualization(prompt, hfToken);
			});

			// Route submissions of the dataset form to handleSubmit.
			const queryForm = document.getElementById('queryForm');
			queryForm.addEventListener('submit', handleSubmit);

			// Show the initial status once this module script has run.
			setStatus('Ready to query parquet files!', 'success');
		</script>
	</body>
</html>