vault_audit_tools/commands/
preprocess_entities.rs

1//! Entity mapping preprocessor.
2//!
3//! ⚠️ **DEPRECATED**: Use `entity-analysis preprocess` instead.
4//!
5//! ```bash
6//! # Old (deprecated):
7//! vault-audit preprocess-entities logs/*.log --output mappings.json
8//!
9//! # New (recommended):
10//! vault-audit entity-analysis preprocess logs/*.log --output mappings.json
11//! ```
12//!
13//! **Note**: Most commands now auto-preprocess entity mappings, so this is rarely needed!
14//!
15//! See [`entity_analysis`](crate::commands::entity_analysis) for the unified command.
16//!
17//! ---
18//!
19//! Extracts entity-to-alias mappings from audit logs and exports to JSON or CSV,
20//! creating a baseline for subsequent entity analysis.
21//! Supports multi-file processing for comprehensive entity mapping.
22//!
23//! # Usage
24//!
25//! ```bash
26//! # Single file preprocessing (JSON default)
27//! vault-audit preprocess-entities audit.log --output entity-mappings.json
28//!
29//! # Multi-day comprehensive mapping (CSV)
30//! vault-audit preprocess-entities logs/*.log --output entity-mappings.csv --format csv
31//!
32//! # JSON format for entity-creation command
33//! vault-audit preprocess-entities logs/*.log --output entity-mappings.json --format json
34//! ```
35//!
36//! # Output
37//!
38//! Generates JSON or CSV containing:
39//! - Entity ID
40//! - Display name
41//! - Mount path and accessor
42//! - Username (if available)
43//! - Login count
44//! - First and last seen timestamps
45//!
46//! This output can be used as a baseline for:
47//! - `entity-creation` command (accepts both CSV and JSON)
48//! - `client-activity` command (JSON format)
49//! - External analysis tools
50//! - Historical trending
51
52use crate::audit::types::AuditEntry;
53use crate::utils::progress::ProgressBar;
54use crate::utils::reader::open_file;
55use anyhow::{Context, Result};
56use serde::{Deserialize, Serialize};
57use std::collections::HashMap;
58use std::fs::File;
59use std::io::{BufRead, BufReader, Write};
60
61/// Entity mapping with login statistics
62#[derive(Debug, Serialize, Deserialize)]
63pub struct EntityMapping {
64    pub display_name: String,
65    pub mount_path: String,
66    pub mount_accessor: String,
67    #[serde(skip_serializing_if = "Option::is_none")]
68    pub username: Option<String>,
69    pub login_count: usize,
70    pub first_seen: String,
71    pub last_seen: String,
72}
73
74/// Build entity mappings from audit logs without writing to file.
75/// Returns `HashMap` of `entity_id` -> `EntityMapping` for reuse by other commands.
76pub fn build_entity_map(log_files: &[String]) -> Result<HashMap<String, EntityMapping>> {
77    let mut entity_map: HashMap<String, EntityMapping> = HashMap::new();
78    let mut login_events = 0;
79    let mut lines_processed = 0;
80
81    // Process each log file sequentially
82    for (file_idx, log_file) in log_files.iter().enumerate() {
83        eprintln!(
84            "[{}/{}] Processing: {}",
85            file_idx + 1,
86            log_files.len(),
87            log_file
88        );
89
90        // Count lines in file first for accurate progress tracking
91        eprintln!("Scanning file to determine total lines...");
92        let total_file_lines = crate::utils::parallel::count_file_lines(log_file)?;
93
94        let file = open_file(log_file)
95            .with_context(|| format!("Failed to open audit log file: {}", log_file))?;
96        let reader = BufReader::new(file);
97
98        let progress = ProgressBar::new(total_file_lines, "Processing");
99        let mut file_lines = 0;
100
101        for line in reader.lines() {
102            file_lines += 1;
103            lines_processed += 1;
104            let line = line?;
105
106            // Update progress every 10k lines for smooth animation
107            if file_lines % 10_000 == 0 {
108                progress.update(file_lines);
109            }
110            let entry: AuditEntry = match serde_json::from_str(&line) {
111                Ok(e) => e,
112                Err(_) => continue,
113            };
114
115            // Look for login events in auth paths
116            let Some(request) = &entry.request else {
117                continue;
118            };
119
120            let Some(path) = &request.path else {
121                continue;
122            };
123
124            if !path.starts_with("auth/") {
125                continue;
126            }
127
128            if !path.contains("/login") {
129                continue;
130            }
131
132            // Skip if no auth info
133            let Some(auth) = &entry.auth else {
134                continue;
135            };
136
137            // Skip if no entity_id or display_name
138            let entity_id = match &auth.entity_id {
139                Some(id) if !id.is_empty() => id.clone(),
140                _ => continue,
141            };
142
143            let display_name = match &auth.display_name {
144                Some(name) if !name.is_empty() => name.clone(),
145                _ => continue,
146            };
147
148            login_events += 1;
149
150            // Extract mount path from the auth path (e.g., "auth/github/login" -> "auth/github")
151            let mount_path = path
152                .trim_end_matches("/login")
153                .trim_end_matches(&format!("/{}", display_name))
154                .to_string();
155
156            let mount_accessor = auth.accessor.clone().unwrap_or_default();
157            let username = auth
158                .metadata
159                .as_ref()
160                .and_then(|m| m.get("username"))
161                .and_then(|v| v.as_str())
162                .map(std::string::ToString::to_string);
163
164            // Update or insert entity mapping
165            entity_map
166                .entry(entity_id)
167                .and_modify(|mapping| {
168                    mapping.login_count += 1;
169                    mapping.last_seen.clone_from(&entry.time);
170                    // Update display_name if it's newer (handle case variations)
171                    if entry.time > mapping.last_seen {
172                        mapping.display_name.clone_from(&display_name);
173                    }
174                })
175                .or_insert_with(|| EntityMapping {
176                    display_name,
177                    mount_path,
178                    mount_accessor,
179                    username,
180                    login_count: 1,
181                    first_seen: entry.time.clone(),
182                    last_seen: entry.time.clone(),
183                });
184        }
185
186        // Ensure we show 100% complete for this file
187        progress.update(total_file_lines);
188
189        progress.finish_with_message(&format!("Processed {} lines from this file", file_lines));
190    }
191
192    eprintln!(
193        "\nTotal: Processed {} lines, found {} login events, tracked {} entities\n",
194        lines_processed,
195        login_events,
196        entity_map.len()
197    );
198
199    Ok(entity_map)
200}
201
202pub fn run(log_files: &[String], output: &str, format: &str) -> Result<()> {
203    eprintln!("Preprocessing audit logs...");
204    eprintln!("Extracting entity → display_name mappings from login events...\n");
205
206    let entity_map = build_entity_map(log_files)?;
207
208    // Write output based on format
209    eprintln!("\nWriting entity mappings to: {}", output);
210
211    match format.to_lowercase().as_str() {
212        "json" => {
213            let output_file = File::create(output)
214                .with_context(|| format!("Failed to create output file: {}", output))?;
215            let mut writer = std::io::BufWriter::new(output_file);
216
217            // Write as pretty JSON for readability
218            let json = serde_json::to_string_pretty(&entity_map)
219                .context("Failed to serialize entity mappings")?;
220            writer.write_all(json.as_bytes())?;
221            writer.flush()?;
222
223            eprintln!("JSON entity mapping file created successfully!\n");
224        }
225        "csv" => {
226            let output_file = File::create(output)
227                .with_context(|| format!("Failed to create output file: {}", output))?;
228            let mut csv_writer = csv::Writer::from_writer(output_file);
229
230            // Write CSV header
231            csv_writer.write_record([
232                "entity_id",
233                "display_name",
234                "mount_path",
235                "mount_accessor",
236                "username",
237                "login_count",
238                "first_seen",
239                "last_seen",
240            ])?;
241
242            // Write entity data
243            for (entity_id, mapping) in &entity_map {
244                csv_writer.write_record([
245                    entity_id,
246                    &mapping.display_name,
247                    &mapping.mount_path,
248                    &mapping.mount_accessor,
249                    mapping.username.as_deref().unwrap_or(""),
250                    &mapping.login_count.to_string(),
251                    &mapping.first_seen,
252                    &mapping.last_seen,
253                ])?;
254            }
255
256            csv_writer.flush()?;
257            eprintln!("✓ CSV entity mapping file created successfully!\n");
258        }
259        _ => {
260            anyhow::bail!("Invalid format '{}'. Use 'csv' or 'json'", format);
261        }
262    }
263
264    eprintln!("Usage with client-activity command:");
265    eprintln!(
266        "  vault-audit client-activity --start <START> --end <END> --entity-map {}",
267        output
268    );
269
270    Ok(())
271}