vault_audit_tools/commands/entity_churn.rs

//! Multi-day entity churn analysis with intelligent ephemeral pattern detection.
//!
//! ⚠️ **DEPRECATED**: Use `entity-analysis churn` instead.
//!
//! ```bash
//! # Old (deprecated):
//! vault-audit entity-churn day1.log day2.log day3.log
//!
//! # New (recommended):
//! vault-audit entity-analysis churn day1.log day2.log day3.log
//! ```
//!
//! See [`entity_analysis`](crate::commands::entity_analysis) for the unified command.
//!
//! ---
//!
//! Tracks entity lifecycle across multiple audit log files (compressed or uncompressed)
//! to identify:
//! - New entities appearing each day
//! - Returning vs. churned entities
//! - Entity persistence patterns
//! - Authentication method usage trends
//! - **Ephemeral entities** using data-driven pattern learning
//!
//! # Usage
//!
//! ```bash
//! # Analyze entity churn across a week (compressed files)
//! vault-audit entity-churn day1.log.gz day2.log.gz day3.log.gz day4.log.gz day5.log.gz day6.log.gz day7.log.gz
//!
//! # With baseline for accurate new entity detection
//! vault-audit entity-churn *.log --baseline baseline_entities.json
//!
//! # With entity mappings for enriched display names
//! vault-audit entity-churn *.log --baseline baseline.json --entity-map entity_mappings.json
//!
//! # Export detailed churn data with ephemeral analysis
//! vault-audit entity-churn *.log --output entity_churn.json
//!
//! # Export as CSV format
//! vault-audit entity-churn *.log --output entity_churn.csv --format csv
//! ```
//!
//! **Compressed File Support**: Automatically handles `.gz` and `.zst` files - no manual
//! decompression required. Mix compressed and uncompressed files freely.
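//!
//! For example, a single run can mix plain and compressed logs without any extra
//! flags (file names are illustrative):
//!
//! ```bash
//! vault-audit entity-churn day1.log day2.log.gz day3.log.zst
//! ```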
//!
//! # Ephemeral Pattern Detection
//!
//! The command uses a sophisticated two-pass analysis to detect ephemeral entities
//! (e.g., CI/CD pipeline entities, temporary build entities) with confidence scoring:
//!
//! **Pass 1: Data Collection**
//! - Track all entities across log files
//! - Record first/last seen times and files
//! - Count login activity per entity
//!
//! **Pass 2: Pattern Learning & Classification**
//! - Learn patterns from entities that appeared 1-2 days
//! - Identify naming patterns (e.g., `github-repo:org/repo:ref:branch`)
//! - Calculate confidence scores (0.0-1.0) based on:
//!   - Days active (1 day = high confidence, 2 days = medium)
//!   - Similar entities on same mount path
//!   - Activity levels (low login counts)
//!   - Gaps in activity (reduces confidence for sporadic access)
//!
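//! As a rough illustration of how those factors combine, here is a simplified
//! sketch of the scoring heuristic. `sketch_confidence` is a hypothetical helper
//! written only for this example; the real logic lives in
//! `EphemeralPatternAnalyzer::analyze_entity` and additionally penalizes gaps in
//! activity.
//!
//! ```rust
//! // Simplified sketch of the confidence heuristic (illustrative only).
//! fn sketch_confidence(days_active: usize, similar_short_lived: usize, total_logins: usize) -> f32 {
//!     let mut confidence: f32 = match days_active {
//!         1 => 0.5, // appeared a single day
//!         2 => 0.3, // appeared two days
//!         _ => 0.0,
//!     };
//!     if days_active <= 2 {
//!         if similar_short_lived > 5 {
//!             confidence += 0.2; // strong naming / mount-path pattern match
//!         } else if similar_short_lived > 0 {
//!             confidence += 0.1; // weak pattern match
//!         }
//!         if total_logins <= 5 {
//!             confidence += 0.1; // low activity
//!         }
//!     }
//!     f32::min(confidence, 1.0)
//! }
//!
//! // A one-day entity with several look-alikes and few logins scores ≥ 0.7 (high).
//! assert!(sketch_confidence(1, 6, 2) >= 0.7);
//! ```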
//!
//! # Output
//!
//! ## Entity Lifecycle Classification:
//! - **`new_day_N`**: Entities first seen on day N (not in baseline)
//! - **`pre_existing_baseline`**: Entities that existed before the analysis period
//!
//! ## Activity Patterns:
//! - **consistent**: Appeared in most/all log files
//! - **sporadic**: Appeared intermittently with gaps
//! - **declining**: Activity decreased over time
//! - **`single_burst`**: Appeared only once
//!
//! ## Ephemeral Detection:
//! - Confidence levels: High (≥70%), Medium (50-69%), Low (40-49%)
//! - Detailed reasoning for each classification
//! - Top ephemeral entities by confidence
//! - Pattern statistics and mount path analysis
//!
//! # JSON Output Fields
//!
//! When using `--output`, each entity record includes:
//! - `entity_id`: Vault entity identifier
//! - `display_name`: Human-readable name
//! - `first_seen_file` / `first_seen_time`: When first observed
//! - `last_seen_file` / `last_seen_time`: When last observed
//! - `files_appeared`: List of log files the entity was active in
//! - `total_logins`: Total login count across all files
//! - `lifecycle`: Entity lifecycle classification
//! - `activity_pattern`: Behavioral pattern classification
//! - `is_ephemeral_pattern`: Boolean flag for ephemeral detection
//! - `ephemeral_confidence`: Confidence score (0.0-1.0)
//! - `ephemeral_reasons`: Array of human-readable reasons
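//!
//! For orientation, a single exported record might look like this (all values are
//! hypothetical; baseline and historical fields are omitted when absent):
//!
//! ```json
//! {
//!   "entity_id": "00000000-0000-0000-0000-000000000000",
//!   "display_name": "github-repo:example-org/example-repo:ref:refs/heads/main",
//!   "mount_path": "auth/github/login",
//!   "mount_type": "jwt",
//!   "token_type": "service",
//!   "first_seen_file": "day1.log",
//!   "first_seen_time": "2024-01-01T08:00:00Z",
//!   "last_seen_file": "day1.log",
//!   "last_seen_time": "2024-01-01T08:05:00Z",
//!   "files_appeared": ["day1.log"],
//!   "total_logins": 2,
//!   "lifecycle": "new_day_1",
//!   "activity_pattern": "single_burst",
//!   "is_ephemeral_pattern": true,
//!   "ephemeral_confidence": 0.6,
//!   "ephemeral_reasons": [
//!     "Appeared only 1 day (day1.log)",
//!     "Low activity: only 2 login(s)",
//!     "Not seen in most recent 2 file(s)"
//!   ]
//! }
//! ```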
//!
//! The command only tracks entities that performed login operations (request paths ending in `/login`).

use crate::audit::types::AuditEntry;
use crate::utils::format::format_number;
use crate::utils::progress::ProgressBar;
use crate::utils::reader::open_file;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

/// Historical entity mapping loaded from the `--entity-map` JSON file
/// (produced by `preprocess-entities` from earlier audit logs)
#[derive(Debug, Serialize, Deserialize)]
struct EntityMapping {
    display_name: String,
    mount_path: String,
    #[allow(dead_code)]
    mount_accessor: String,
    #[allow(dead_code)]
    login_count: usize,
    #[allow(dead_code)]
    first_seen: String,
    #[allow(dead_code)]
    last_seen: String,
}

/// Represents an entity's churn status
#[derive(Debug, Serialize, Clone)]
struct EntityChurnRecord {
    entity_id: String,
    display_name: String,
    mount_path: String,
    mount_type: String,
    token_type: String,
    first_seen_file: String,
    first_seen_time: DateTime<Utc>,
    last_seen_file: String,
    last_seen_time: DateTime<Utc>,
    files_appeared: Vec<String>,
    total_logins: usize,
    lifecycle: String, // e.g. "new_day_1", "new_day_N", "pre_existing_baseline", "pre_existing_or_new_day_1"
    activity_pattern: String, // "consistent", "sporadic", "declining", "single_burst", "unknown"
    is_ephemeral_pattern: bool,
    ephemeral_confidence: f32, // 0.0 to 1.0
    ephemeral_reasons: Vec<String>,
    // Baseline metadata (if entity existed in baseline)
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_entity_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_created: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_alias_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_mount_path: Option<String>,
    // Entity-map metadata (from historical audit logs via preprocess-entities)
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_display_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_first_seen: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_last_seen: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    historical_login_count: Option<usize>,
}

/// CSV-compatible representation of entity churn record
#[derive(Debug, Serialize)]
struct EntityChurnRecordCsv {
    entity_id: String,
    display_name: String,
    mount_path: String,
    mount_type: String,
    token_type: String,
    first_seen_file: String,
    first_seen_time: String,
    last_seen_file: String,
    last_seen_time: String,
    files_appeared: String, // Comma-separated list
    days_active: usize,
    total_logins: usize,
    lifecycle: String,
    activity_pattern: String,
    is_ephemeral_pattern: bool,
    ephemeral_confidence: f32,
    ephemeral_reasons: String, // Semicolon-separated list
    baseline_entity_name: String,
    baseline_created: String,
    baseline_alias_name: String,
    baseline_mount_path: String,
    historical_display_name: String,
    historical_first_seen: String,
    historical_last_seen: String,
    historical_login_count: String,
}

impl From<EntityChurnRecord> for EntityChurnRecordCsv {
    fn from(record: EntityChurnRecord) -> Self {
        Self {
            entity_id: record.entity_id,
            display_name: record.display_name,
            mount_path: record.mount_path,
            mount_type: record.mount_type,
            token_type: record.token_type,
            first_seen_file: record.first_seen_file,
            first_seen_time: record.first_seen_time.to_rfc3339(),
            last_seen_file: record.last_seen_file,
            last_seen_time: record.last_seen_time.to_rfc3339(),
            files_appeared: record.files_appeared.join(", "),
            days_active: record.files_appeared.len(),
            total_logins: record.total_logins,
            lifecycle: record.lifecycle,
            activity_pattern: record.activity_pattern,
            is_ephemeral_pattern: record.is_ephemeral_pattern,
            ephemeral_confidence: record.ephemeral_confidence,
            ephemeral_reasons: record.ephemeral_reasons.join("; "),
            baseline_entity_name: record.baseline_entity_name.unwrap_or_default(),
            baseline_created: record.baseline_created.unwrap_or_default(),
            baseline_alias_name: record.baseline_alias_name.unwrap_or_default(),
            baseline_mount_path: record.baseline_mount_path.unwrap_or_default(),
            historical_display_name: record.historical_display_name.unwrap_or_default(),
            historical_first_seen: record.historical_first_seen.unwrap_or_default(),
            historical_last_seen: record.historical_last_seen.unwrap_or_default(),
            historical_login_count: record
                .historical_login_count
                .map(|n| n.to_string())
                .unwrap_or_default(),
        }
    }
}

#[derive(Debug, Clone)]
struct DailyStats {
    #[allow(dead_code)]
    file_name: String,
    new_entities: usize,
    returning_entities: usize,
    total_logins: usize,
}

/// Analyzes entity behavior patterns to detect ephemeral entities
#[derive(Debug)]
struct EphemeralPatternAnalyzer {
    total_files: usize,
    short_lived_patterns: Vec<ShortLivedPattern>,
}

#[derive(Debug)]
struct ShortLivedPattern {
    days_active: usize,
    display_name: String,
    mount_path: String,
}

impl EphemeralPatternAnalyzer {
    const fn new(total_files: usize) -> Self {
        Self {
            total_files,
            short_lived_patterns: Vec::new(),
        }
    }

    /// Learn patterns from entities that appeared 1-2 days (potential ephemeral patterns)
    fn learn_from_entities(&mut self, entities: &HashMap<String, EntityChurnRecord>) {
        for entity in entities.values() {
            let days_active = entity.files_appeared.len();

            // Learn from entities that appeared 1-2 days only
            if days_active <= 2 {
                self.short_lived_patterns.push(ShortLivedPattern {
                    days_active,
                    display_name: entity.display_name.clone(),
                    mount_path: entity.mount_path.clone(),
                });
            }
        }
    }

    /// Analyze an entity and determine if it matches ephemeral patterns
    fn analyze_entity(&self, entity: &EntityChurnRecord) -> (bool, f32, Vec<String>) {
        let days_active = entity.files_appeared.len();
        let mut confidence = 0.0;
        let mut reasons = Vec::new();

        // Strong indicators (high confidence)
        if days_active == 1 {
            confidence += 0.5;
            reasons.push(format!("Appeared only 1 day ({})", entity.first_seen_file));
        } else if days_active == 2 {
            confidence += 0.3;
            reasons.push(format!(
                "Appeared only 2 days: {}, {}",
                entity.files_appeared.first().unwrap_or(&String::new()),
                entity.files_appeared.last().unwrap_or(&String::new())
            ));
        }

        // Pattern matching: Check if display name follows patterns seen in other short-lived entities
        if days_active <= 2 {
            // Count how many other short-lived entities share similar patterns
            let similar_count = self
                .short_lived_patterns
                .iter()
                .filter(|p| {
                    // Same mount path
                    if p.mount_path == entity.mount_path && p.days_active <= 2 {
                        return true;
                    }
                    // Similar naming pattern (e.g., github-repo:* or airflow-*)
                    if entity.display_name.contains(':') && p.display_name.contains(':') {
                        let entity_prefix = entity.display_name.split(':').next().unwrap_or("");
                        let pattern_prefix = p.display_name.split(':').next().unwrap_or("");
                        if entity_prefix == pattern_prefix && !entity_prefix.is_empty() {
                            return true;
                        }
                    }
                    false
                })
                .count();

            if similar_count > 5 {
                confidence += 0.2;
                reasons.push(format!(
                    "Matches pattern seen in {} other short-lived entities",
                    similar_count
                ));
            } else if similar_count > 0 {
                confidence += 0.1;
                reasons.push(format!(
                    "Similar to {} other short-lived entities",
                    similar_count
                ));
            }
        }

        // Low activity indicator
        if entity.total_logins <= 5 && days_active <= 2 {
            confidence += 0.1;
            reasons.push(format!(
                "Low activity: only {} login(s)",
                entity.total_logins
            ));
        }

        // Non-continuous appearance (sporadic pattern suggests not churned, just periodic)
        // Day indices are parsed from file names shaped like `prefix_<N>.log`; names that
        // don't match (e.g. compressed `.gz`/`.zst` files) yield None and the gap check
        // is skipped.
        if days_active >= 2 {
            let first_day_idx = entity.files_appeared.first().and_then(|f| {
                f.split('_')
                    .next_back()
                    .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
            });
            let last_day_idx = entity.files_appeared.last().and_then(|f| {
                f.split('_')
                    .next_back()
                    .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
            });

            if let (Some(first), Some(last)) = (first_day_idx, last_day_idx) {
                // saturating_sub guards against file names that are not in ascending order
                let span = last.saturating_sub(first) + 1;
                if span > days_active {
                    // Gaps in activity - reduce confidence
                    confidence *= 0.7;
                    reasons.push(
                        "Has gaps in activity (possibly sporadic access, not churned)".to_string(),
                    );
                }
            }
        }

        // Cap confidence and determine ephemeral status
        confidence = f32::min(confidence, 1.0);
        let is_ephemeral = confidence >= 0.4; // Threshold for classification

        // Add absence indicator if not seen in recent files
        if is_ephemeral && days_active < self.total_files {
            reasons.push(format!(
                "Not seen in most recent {} file(s)",
                self.total_files - days_active
            ));
        }

        (is_ephemeral, confidence, reasons)
    }

    /// Determine activity pattern based on appearance across files
    fn classify_activity_pattern(&self, entity: &EntityChurnRecord) -> String {
        let days_active = entity.files_appeared.len();

        if days_active == 1 {
            return "single_burst".to_string();
        }

        if days_active == self.total_files {
            return "consistent".to_string();
        }

        if days_active >= (self.total_files * 2) / 3 {
            return "consistent".to_string();
        }

        // Check if activity is declining (appeared early but stopped)
        if let (Some(_first_file), Some(last_file)) =
            (entity.files_appeared.first(), entity.files_appeared.last())
        {
            // Simple heuristic: if last seen was in first half of files, it's declining
            let last_file_num = last_file
                .split('_')
                .next_back()
                .and_then(|s| s.trim_end_matches(".log").parse::<usize>().ok())
                .unwrap_or(self.total_files);

            if last_file_num < self.total_files / 2 {
                return "declining".to_string();
            }
        }

        if days_active <= 2 {
            return "single_burst".to_string();
        }

        "sporadic".to_string()
    }
}

fn get_file_size(path: &str) -> Result<u64> {
    Ok(std::fs::metadata(path)?.len())
}

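/// Load the `--entity-map` JSON file: a map from entity ID to `EntityMapping`
/// carrying historical audit-log metadata.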
fn load_entity_mappings(path: &str) -> Result<HashMap<String, EntityMapping>> {
    let file = File::open(path).context("Failed to open entity map file")?;
    let mappings: HashMap<String, EntityMapping> =
        serde_json::from_reader(file).context("Failed to parse entity map JSON")?;
    Ok(mappings)
}

#[derive(Debug, Deserialize, Clone)]
#[allow(dead_code)]
struct BaselineEntity {
    entity_id: String,
    // Fields from entity-list (Vault API) - full metadata
    #[serde(default)]
    entity_name: String,
    #[serde(default)]
    entity_disabled: bool,
    #[serde(default)]
    entity_created: String,
    #[serde(default)]
    entity_updated: String,
    #[serde(default)]
    alias_id: String,
    #[serde(default)]
    alias_name: String,
    #[serde(default)]
    mount_path: String,
    #[serde(default)]
    mount_type: String,
    #[serde(default)]
    mount_accessor: String,
    #[serde(default)]
    alias_created: String,
    #[serde(default)]
    alias_updated: String,
    #[serde(default)]
    alias_metadata: String,
}

impl BaselineEntity {
    /// Get the best available name (`entity_name` if available, otherwise `alias_name`)
    fn get_name(&self) -> String {
        if !self.entity_name.is_empty() {
            self.entity_name.clone()
        } else if !self.alias_name.is_empty() {
            self.alias_name.clone()
        } else {
            String::new()
        }
    }

    /// Get the entity creation time
    fn get_created(&self) -> String {
        self.entity_created.clone()
    }
}

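/// Load baseline entities (keyed by `entity_id`) from `entity-list` output,
/// accepting either JSON (`--format json`) or the default CSV export.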
fn load_baseline_entities(path: &str) -> Result<HashMap<String, BaselineEntity>> {
    let file = File::open(path).context("Failed to open baseline entities file")?;

    // Check if it's JSON or CSV based on file extension
    let path_lower = path.to_lowercase();
    if std::path::Path::new(&path_lower)
        .extension()
        .is_some_and(|ext| ext.eq_ignore_ascii_case("json"))
    {
        // JSON format from entity-list with --format json
        let entities: Vec<BaselineEntity> =
            serde_json::from_reader(file).context("Failed to parse baseline entities JSON")?;
        Ok(entities
            .into_iter()
            .map(|e| (e.entity_id.clone(), e))
            .collect())
    } else {
        // CSV format (default entity-list output)
        let mut reader = csv::Reader::from_reader(file);
        let mut entities = HashMap::with_capacity(5000); // Pre-allocate for entity mappings

        for result in reader.deserialize() {
            let entity: BaselineEntity = result.context("Failed to parse baseline CSV row")?;
            // Use first occurrence of each entity_id (entities can have multiple aliases)
            entities.entry(entity.entity_id.clone()).or_insert(entity);
        }

        Ok(entities)
    }
}

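/// Run the multi-day entity churn analysis.
///
/// Log files are processed in the order given (one file per "day"). An optional
/// `baseline_entities` file marks pre-existing entities, an optional `entity_map`
/// enriches records with historical metadata, and `output`/`format` control the
/// exported report (JSON by default, CSV on request).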
pub fn run(
    log_files: &[String],
    entity_map: Option<&str>,
    baseline_entities: Option<&str>,
    output: Option<&str>,
    format: Option<&str>,
) -> Result<()> {
    println!("\n=== Multi-Day Entity Churn Analysis ===\n");
    println!("Analyzing {} log files:", log_files.len());
    for (i, file) in log_files.iter().enumerate() {
        let size = get_file_size(file)?;
        println!(
            "  Day {}: {} ({:.2} GB)",
            i + 1,
            file,
            size as f64 / 1_000_000_000.0
        );
    }
    println!();

    // Load baseline entities if provided
    let baseline = if let Some(path) = baseline_entities {
        println!(
            "Loading baseline entity list (Vault API metadata) from {}...",
            path
        );
        let baseline_set = load_baseline_entities(path)?;
        println!(
            "Loaded {} pre-existing entities from Vault API baseline",
            format_number(baseline_set.len())
        );
        println!();
        Some(baseline_set)
    } else {
        println!("No baseline entity list provided. Cannot distinguish truly NEW entities from pre-existing.");
        println!("   All Day 1 entities will be marked as 'pre_existing_or_new_day_1'.");
        println!("   To get accurate results, run: ./vault-audit entity-list --output baseline_entities.json\n");
        None
    };

    // Load entity mappings if provided (historical data from audit logs)
    let entity_mappings = if let Some(path) = entity_map {
        println!(
            "Loading historical entity mappings (audit log enrichment) from {}...",
            path
        );
        let mappings = load_entity_mappings(path)?;
        println!(
            "Loaded {} entity mappings with historical audit log data",
            format_number(mappings.len())
        );
        println!();
        Some(mappings)
    } else {
        None
    };

    // Track all entities across all files
    // Pre-allocate for typical entity counts in enterprise environments
    let mut entities: HashMap<String, EntityChurnRecord> = HashMap::with_capacity(5000);
    let mut daily_stats: Vec<DailyStats> = Vec::new();

    // Process each log file in order
    for (file_idx, log_file) in log_files.iter().enumerate() {
        let file_name = Path::new(log_file)
            .file_name()
            .unwrap()
            .to_string_lossy()
            .to_string();

        println!("\nProcessing Day {} ({})...", file_idx + 1, file_name);

        let file = open_file(log_file)
            .with_context(|| format!("Failed to open log file: {}", log_file))?;
        let file_size = get_file_size(log_file)? as usize;

        let reader = BufReader::new(file);
        let mut progress = ProgressBar::new(file_size, "Processing");

        let mut new_entities_this_file = 0;
        let mut returning_entities_this_file = HashSet::new();
        let mut logins_this_file = 0;
        let mut bytes_processed = 0;
        let mut lines_read = 0_usize;

        for line in reader.lines() {
            let line = line.context("Failed to read line from log file")?;
            bytes_processed += line.len() + 1; // +1 for newline
            lines_read += 1;

            // Update progress every 10,000 lines (an exact byte-count modulo check
            // would almost never fire, since bytes_processed advances by whole lines)
            if lines_read % 10_000 == 0 {
                progress.update(bytes_processed.min(file_size));
            }

            let trimmed = line.trim();
            if trimmed.is_empty() {
                continue;
            }

            let entry: AuditEntry = match serde_json::from_str(trimmed) {
                Ok(e) => e,
                Err(_) => continue,
            };

            // Only process login operations (auth paths ending in /login)
            let Some(ref request) = entry.request else {
                continue;
            };
            let Some(ref path) = request.path else {
                continue;
            };
            if !path.ends_with("/login") {
                continue;
            }

            logins_this_file += 1;

            // Extract entity info
            let Some(ref auth) = entry.auth else {
                continue;
            };
            let Some(ref entity_id) = auth.entity_id else {
                continue;
            };

            let display_name = auth
                .display_name
                .clone()
                .unwrap_or_else(|| entity_id.clone());
            // Note: the full login request path (e.g. "auth/github/login") is what
            // gets recorded as the entity's mount_path.
            let mount_path = request.path.clone().unwrap_or_default();
            let mount_type = request.mount_type.clone().unwrap_or_default();
            let token_type = auth.token_type.clone().unwrap_or_default();

            // Parse timestamp
            let first_seen_time = chrono::DateTime::parse_from_rfc3339(&entry.time)
                .ok()
                .map_or_else(Utc::now, |dt| dt.with_timezone(&Utc));

            // Check if this entity exists from a previous file
            if let Some(entity_record) = entities.get_mut(entity_id) {
                // Returning entity
                entity_record.total_logins += 1;
                entity_record.last_seen_file.clone_from(&file_name);
                entity_record.last_seen_time = first_seen_time;
                if !entity_record.files_appeared.contains(&file_name) {
                    entity_record.files_appeared.push(file_name.clone());
                }
                returning_entities_this_file.insert(entity_id.clone());
            } else {
                // New entity (first time across all files)
                new_entities_this_file += 1;

                // Determine lifecycle based on baseline and which file this is
                let lifecycle = if let Some(ref baseline_set) = baseline {
                    if baseline_set.contains_key(entity_id) {
                        "pre_existing_baseline".to_string()
                    } else {
                        // Not in baseline, so truly NEW during analysis period
                        match file_idx {
                            0 => "new_day_1".to_string(),
                            1 => "new_day_2".to_string(),
                            2 => "new_day_3".to_string(),
                            _ => format!("new_day_{}", file_idx + 1),
                        }
                    }
                } else {
                    // No baseline provided, can't distinguish
                    match file_idx {
                        0 => "pre_existing_or_new_day_1".to_string(),
                        1 => "new_day_2".to_string(),
                        2 => "new_day_3".to_string(),
                        _ => format!("new_day_{}", file_idx + 1),
                    }
                };

                // Get baseline metadata if entity exists in baseline
                let (
                    baseline_entity_name,
                    baseline_created,
                    baseline_alias_name,
                    baseline_mount_path,
                ) = if let Some(ref baseline_map) = baseline {
                    if let Some(baseline_entity) = baseline_map.get(entity_id) {
                        let name = baseline_entity.get_name();
                        let created = baseline_entity.get_created();
                        (
                            if name.is_empty() { None } else { Some(name) },
                            if created.is_empty() {
                                None
                            } else {
                                Some(created)
                            },
                            if baseline_entity.alias_name.is_empty() {
                                None
                            } else {
                                Some(baseline_entity.alias_name.clone())
                            },
                            if baseline_entity.mount_path.is_empty() {
                                None
                            } else {
                                Some(baseline_entity.mount_path.clone())
                            },
                        )
                    } else {
                        (None, None, None, None)
                    }
                } else {
                    (None, None, None, None)
                };

                // Fetch historical data from entity_mappings
                let (
                    historical_display_name,
                    historical_first_seen,
                    historical_last_seen,
                    historical_login_count,
                ) = if let Some(ref mappings) = entity_mappings {
                    if let Some(mapping) = mappings.get(entity_id) {
                        (
                            Some(mapping.display_name.clone()),
                            Some(mapping.first_seen.clone()),
                            Some(mapping.last_seen.clone()),
                            Some(mapping.login_count),
                        )
                    } else {
                        (None, None, None, None)
                    }
                } else {
                    (None, None, None, None)
                };

                entities.insert(
                    entity_id.clone(),
                    EntityChurnRecord {
                        entity_id: entity_id.clone(),
                        display_name: display_name.clone(),
                        mount_path: mount_path.clone(),
                        mount_type: mount_type.clone(),
                        token_type: token_type.clone(),
                        first_seen_file: file_name.clone(),
                        first_seen_time,
                        last_seen_file: file_name.clone(),
                        last_seen_time: first_seen_time,
                        files_appeared: vec![file_name.clone()],
                        total_logins: 1,
                        lifecycle,
                        activity_pattern: "unknown".to_string(), // Will be computed in second pass
                        is_ephemeral_pattern: false,             // Will be computed in second pass
                        ephemeral_confidence: 0.0,               // Will be computed in second pass
                        ephemeral_reasons: Vec::new(),           // Will be computed in second pass
                        baseline_entity_name,
                        baseline_created,
                        baseline_alias_name,
                        baseline_mount_path,
                        historical_display_name,
                        historical_first_seen,
                        historical_last_seen,
                        historical_login_count,
                    },
                );
            }
        }

        progress.finish();

        daily_stats.push(DailyStats {
            file_name,
            new_entities: new_entities_this_file,
            returning_entities: returning_entities_this_file.len(),
            total_logins: logins_this_file,
        });

        println!(
            "Day {} Summary: {} new entities, {} returning, {} logins",
            file_idx + 1,
            format_number(new_entities_this_file),
            format_number(returning_entities_this_file.len()),
            format_number(logins_this_file)
        );
    }

    // === SECOND PASS: Analyze patterns and classify entities ===
    println!("\nAnalyzing entity behavior patterns...");

    let mut analyzer = EphemeralPatternAnalyzer::new(log_files.len());

    // Step 1: Learn patterns from short-lived entities
    analyzer.learn_from_entities(&entities);
    println!(
        "Learned from {} short-lived entity patterns",
        format_number(analyzer.short_lived_patterns.len())
    );

    // Step 2: Classify all entities using learned patterns
    let entity_ids: Vec<String> = entities.keys().cloned().collect();
    for entity_id in entity_ids {
        if let Some(entity) = entities.get_mut(&entity_id) {
            // Classify activity pattern
            entity.activity_pattern = analyzer.classify_activity_pattern(entity);

            // Analyze for ephemeral patterns
            let (is_ephemeral, confidence, reasons) = analyzer.analyze_entity(entity);
            entity.is_ephemeral_pattern = is_ephemeral;
            entity.ephemeral_confidence = confidence;
            entity.ephemeral_reasons = reasons;
        }
    }

    // Generate final report
    println!("\n=== Entity Churn Analysis ===\n");

    println!("Daily Breakdown:");
    for (idx, stats) in daily_stats.iter().enumerate() {
        println!(
            "  Day {}: {} new, {} returning, {} total logins",
            idx + 1,
            format_number(stats.new_entities),
            format_number(stats.returning_entities),
            format_number(stats.total_logins)
        );
    }

    // Lifecycle classification
    let mut lifecycle_counts: HashMap<String, usize> = HashMap::with_capacity(20); // Small set of lifecycle categories
    let mut entities_by_file_count: HashMap<usize, usize> = HashMap::with_capacity(log_files.len());

    for entity in entities.values() {
        *lifecycle_counts
            .entry(entity.lifecycle.clone())
            .or_insert(0) += 1;
        *entities_by_file_count
            .entry(entity.files_appeared.len())
            .or_insert(0) += 1;
    }

    println!("\nEntity Lifecycle Classification:");
    let mut lifecycle_vec: Vec<_> = lifecycle_counts.iter().collect();
    lifecycle_vec.sort_by_key(|(k, _)| *k);
    for (lifecycle, count) in lifecycle_vec {
        println!("  {}: {}", lifecycle, format_number(*count));
    }

    println!("\nEntity Persistence:");
    for day_count in 1..=log_files.len() {
        if let Some(count) = entities_by_file_count.get(&day_count) {
            let label = if day_count == 1 {
                "Appeared 1 day only"
            } else if day_count == log_files.len() {
                "Appeared all days (persistent)"
            } else {
                "Appeared some days"
            };
            println!(
                "  {} day(s): {} entities ({})",
                day_count,
                format_number(*count),
                label
            );
        }
    }

    // Activity pattern analysis
    let mut activity_pattern_counts: HashMap<String, usize> = HashMap::with_capacity(10); // Small set of activity patterns
    let mut ephemeral_entities = Vec::new();

    for entity in entities.values() {
        *activity_pattern_counts
            .entry(entity.activity_pattern.clone())
            .or_insert(0) += 1;

        if entity.is_ephemeral_pattern {
            ephemeral_entities.push(entity.clone());
        }
    }

    println!("\nActivity Pattern Distribution:");
    let mut pattern_vec: Vec<_> = activity_pattern_counts.iter().collect();
    pattern_vec.sort_by(|a, b| b.1.cmp(a.1));
    for (pattern, count) in pattern_vec {
        println!("  {}: {}", pattern, format_number(*count));
    }

    println!("\nEphemeral Entity Detection:");
    println!(
        "  Detected {} likely ephemeral entities (confidence ≥ 0.4)",
        format_number(ephemeral_entities.len())
    );

    if !ephemeral_entities.is_empty() {
        // Sort by confidence
        ephemeral_entities.sort_by(|a, b| {
            b.ephemeral_confidence
                .partial_cmp(&a.ephemeral_confidence)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        println!("  Top 10 by confidence:");
        for (idx, entity) in ephemeral_entities.iter().take(10).enumerate() {
            println!(
                "    {}. {} (confidence: {:.1}%)",
                idx + 1,
                entity.display_name,
                entity.ephemeral_confidence * 100.0
            );
            for reason in &entity.ephemeral_reasons {
                println!("       - {}", reason);
            }
        }

        // Breakdown by confidence ranges
        let high_conf = ephemeral_entities
            .iter()
            .filter(|e| e.ephemeral_confidence >= 0.7)
            .count();
        let med_conf = ephemeral_entities
            .iter()
            .filter(|e| e.ephemeral_confidence >= 0.5 && e.ephemeral_confidence < 0.7)
            .count();
        let low_conf = ephemeral_entities
            .iter()
            .filter(|e| e.ephemeral_confidence >= 0.4 && e.ephemeral_confidence < 0.5)
            .count();

        println!("\n  Confidence distribution:");
        println!("    High (≥70%): {}", format_number(high_conf));
        println!("    Medium (50-69%): {}", format_number(med_conf));
        println!("    Low (40-49%): {}", format_number(low_conf));
    }

    // Mount path breakdown
    let mut mount_stats: HashMap<String, (usize, String)> = HashMap::with_capacity(100); // Typical: dozens of mount points
    for entity in entities.values() {
        let entry = mount_stats
            .entry(entity.mount_path.clone())
            .or_insert_with(|| (0, entity.mount_type.clone()));
        entry.0 += 1;
    }

    println!("\nTop Authentication Methods (Total Entities):");
    let mut mount_vec: Vec<_> = mount_stats.iter().collect();
    mount_vec.sort_by(|a, b| b.1 .0.cmp(&a.1 .0));

    for (idx, (path, (count, mount_type))) in mount_vec.iter().take(20).enumerate() {
        println!(
            "  {}. {} ({}): {}",
            idx + 1,
            path,
            mount_type,
            format_number(*count)
        );
    }

    // Calculate GitHub duplication if present
    let github_entities: Vec<_> = entities
        .values()
        .filter(|e| e.mount_path.contains("/github"))
        .collect();

    if !github_entities.is_empty() {
        println!("\n=== GitHub Entity Analysis ===");
        println!(
            "Total GitHub entities: {}",
            format_number(github_entities.len())
        );

        // Extract repo names and count duplicates
        let mut repo_counts: HashMap<String, usize> = HashMap::new();
        for entity in &github_entities {
            // Extract repo from "github-repo:org/repo:..." pattern
            if let Some(repo) = entity.display_name.split(':').nth(1) {
                *repo_counts.entry(repo.to_string()).or_insert(0) += 1;
            }
        }

        println!("Unique repositories: {}", format_number(repo_counts.len()));
        println!("\nTop repositories by entity count:");
        let mut repo_vec: Vec<_> = repo_counts.iter().collect();
        repo_vec.sort_by(|a, b| b.1.cmp(a.1));

        for (idx, (repo, count)) in repo_vec.iter().take(20).enumerate() {
            if **count > 1 {
                println!(
                    "  {}. {}: {} entities",
                    idx + 1,
                    repo,
                    format_number(**count)
                );
            }
        }
    }

    // Export to file if requested
    if let Some(output_path) = output {
        let mut entities_vec: Vec<_> = entities.into_values().collect();
        entities_vec.sort_by(|a, b| a.first_seen_time.cmp(&b.first_seen_time));

        // Determine format from parameter or file extension
        let output_format = format.unwrap_or_else(|| {
            if std::path::Path::new(output_path)
                .extension()
                .is_some_and(|ext| ext.eq_ignore_ascii_case("csv"))
            {
                "csv"
            } else {
                "json"
            }
        });

        println!(
            "\nExporting detailed entity records to {} (format: {})...",
            output_path, output_format
        );

        let output_file = File::create(output_path)
            .with_context(|| format!("Failed to create output file: {}", output_path))?;

        match output_format {
            "csv" => {
                let mut writer = csv::Writer::from_writer(output_file);
                for entity in &entities_vec {
                    let csv_record: EntityChurnRecordCsv = entity.clone().into();
                    writer
                        .serialize(&csv_record)
                        .context("Failed to write CSV record")?;
                }
                writer.flush().context("Failed to flush CSV writer")?;
            }
            _ => {
                // Default to JSON
                serde_json::to_writer_pretty(output_file, &entities_vec)
                    .context("Failed to write JSON output")?;
            }
        }

        println!(
            "Exported {} entity records",
            format_number(entities_vec.len())
        );
    }

    println!("\n=== Analysis Complete ===\n");
    Ok(())
}
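
// The tests below are a minimal, illustrative sketch of the ephemeral heuristic on a
// hypothetical single-day entity. All field values are made up; the assertions only
// exercise the scoring and classification paths described in the module docs.
#[cfg(test)]
mod tests {
    use super::*;

    // Hypothetical one-day, low-activity entity record used by both tests.
    fn sample_single_day_record() -> EntityChurnRecord {
        EntityChurnRecord {
            entity_id: "test-entity-id".to_string(),
            display_name: "github-repo:example-org/example-repo:ref:main".to_string(),
            mount_path: "auth/github/login".to_string(),
            mount_type: "jwt".to_string(),
            token_type: "service".to_string(),
            first_seen_file: "day1.log".to_string(),
            first_seen_time: Utc::now(),
            last_seen_file: "day1.log".to_string(),
            last_seen_time: Utc::now(),
            files_appeared: vec!["day1.log".to_string()],
            total_logins: 2,
            lifecycle: "new_day_1".to_string(),
            activity_pattern: "unknown".to_string(),
            is_ephemeral_pattern: false,
            ephemeral_confidence: 0.0,
            ephemeral_reasons: Vec::new(),
            baseline_entity_name: None,
            baseline_created: None,
            baseline_alias_name: None,
            baseline_mount_path: None,
            historical_display_name: None,
            historical_first_seen: None,
            historical_last_seen: None,
            historical_login_count: None,
        }
    }

    #[test]
    fn single_day_low_activity_entity_is_flagged_ephemeral() {
        let analyzer = EphemeralPatternAnalyzer::new(7);
        let record = sample_single_day_record();

        let (is_ephemeral, confidence, reasons) = analyzer.analyze_entity(&record);

        // 1 day active (+0.5) and <= 5 logins (+0.1) clears the 0.4 threshold.
        assert!(is_ephemeral);
        assert!(confidence >= 0.4);
        assert!(!reasons.is_empty());
    }

    #[test]
    fn single_day_entity_is_classified_as_single_burst() {
        let analyzer = EphemeralPatternAnalyzer::new(7);
        let record = sample_single_day_record();

        assert_eq!(analyzer.classify_activity_pattern(&record), "single_burst");
    }
}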