From 7dd231b164b88187903e27d76a09f0988ebafa06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?= Date: Thu, 1 Oct 2020 11:31:44 +0200 Subject: [PATCH] Add implementation of FSWatch for file scanner --- filebeat/input/filestream/fswatch.go | 387 ++++++++++++++++++ filebeat/input/filestream/fswatch_test.go | 300 ++++++++++++++ .../input/filestream/testdata/excluded_file | 0 .../input/filestream/testdata/included_file | 0 .../testdata/symlink_to_included_file | 1 + 5 files changed, 688 insertions(+) create mode 100644 filebeat/input/filestream/fswatch.go create mode 100644 filebeat/input/filestream/fswatch_test.go create mode 100644 filebeat/input/filestream/testdata/excluded_file create mode 100644 filebeat/input/filestream/testdata/included_file create mode 120000 filebeat/input/filestream/testdata/symlink_to_included_file diff --git a/filebeat/input/filestream/fswatch.go b/filebeat/input/filestream/fswatch.go new file mode 100644 index 00000000000..a551d1cb9a1 --- /dev/null +++ b/filebeat/input/filestream/fswatch.go @@ -0,0 +1,387 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package filestream + +import ( + "fmt" + "os" + "path/filepath" + "time" + + "github.com/elastic/beats/v7/filebeat/input/file" + loginp "github.com/elastic/beats/v7/filebeat/input/filestream/internal/input-logfile" + "github.com/elastic/beats/v7/libbeat/common" + "github.com/elastic/beats/v7/libbeat/common/match" + "github.com/elastic/beats/v7/libbeat/logp" + "github.com/elastic/go-concert/unison" +) + +const ( + recursiveGlobDepth = 8 + scannerName = "scanner" + watcherDebugKey = "file_watcher" +) + +var ( + watcherFactories = map[string]watcherFactory{ + scannerName: newScannerWatcher, + } +) + +type watcherFactory func(paths []string, cfg *common.Config) (loginp.FSWatcher, error) + +// fileScanner looks for files which match the patterns in paths. +// It is able to exclude files and symlinks. +type fileScanner struct { + paths []string + excludedFiles []match.Matcher + symlinks bool + + log *logp.Logger +} + +type fileWatcherConfig struct { + // Interval is the time between two scans. + Interval time.Duration + // Scanner is the configuration of the scanner. + Scanner fileScannerConfig +} + +// fileWatcher gets the list of files from a FSWatcher and creates events by +// comparing the files between its last two runs. +type fileWatcher struct { + interval time.Duration + prev map[string]os.FileInfo + scanner loginp.FSScanner + log *logp.Logger + events chan loginp.FSEvent +} + +func newFileWatcher(paths []string, ns *common.ConfigNamespace) (loginp.FSWatcher, error) { + if ns == nil { + return newScannerWatcher(paths, nil) + } + + watcherType := ns.Name() + f, ok := watcherFactories[watcherType] + if !ok { + return nil, fmt.Errorf("no such file watcher: %s", watcherType) + } + + return f(paths, ns.Config()) +} + +func newScannerWatcher(paths []string, c *common.Config) (loginp.FSWatcher, error) { + config := defaultFileWatcherConfig() + err := c.Unpack(&config) + if err != nil { + return nil, err + } + scanner, err := newFileScanner(paths, config.Scanner) + if err != nil { + return nil, err + } + return &fileWatcher{ + log: logp.NewLogger(watcherDebugKey), + interval: config.Interval, + prev: make(map[string]os.FileInfo, 0), + scanner: scanner, + events: make(chan loginp.FSEvent), + }, nil +} + +func defaultFileWatcherConfig() fileWatcherConfig { + return fileWatcherConfig{ + Interval: 10 * time.Second, + Scanner: defaultFileScannerConfig(), + } +} + +func (w *fileWatcher) Run(ctx unison.Canceler) { + defer close(w.events) + + ticker := time.NewTicker(w.interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + w.watch(ctx) + } + } +} + +func (w *fileWatcher) watch(ctx unison.Canceler) { + w.log.Info("Start next scan") + + paths := w.scanner.GetFiles() + files := getKeys(paths) + + newFiles := make(map[string]os.FileInfo) + + for i := 0; i < len(paths); i++ { + path := files[i] + info := paths[path] + + prevInfo, ok := w.prev[path] + if !ok { + newFiles[path] = paths[path] + continue + } + + if prevInfo.ModTime() != info.ModTime() { + select { + case <-ctx.Done(): + return + case w.events <- w.writeEvent(path, info): + } + } + + // delete from previous state, as we have more up to date info + delete(w.prev, path) + } + + // remaining files are in the prev map are the ones that are missing + // either because they have been deleted or renamed + for deletedPath, deletedInfo := range w.prev { + for newPath, newInfo := range newFiles { + if os.SameFile(deletedInfo, newInfo) { + select { + case <-ctx.Done(): + return + case w.events <- w.renamedEvent(deletedPath, newPath, newInfo): + delete(newFiles, newPath) + goto CHECK_NEXT_DELETED + } + } + } + + select { + case <-ctx.Done(): + return + case w.events <- w.deleteEvent(deletedPath, deletedInfo): + } + CHECK_NEXT_DELETED: + } + + // remaining files in newFiles are new + for path, info := range newFiles { + select { + case <-ctx.Done(): + return + case w.events <- w.createEvent(path, info): + } + + } + + w.log.Debugf("Found %d paths", len(paths)) + w.prev = paths +} + +func getKeys(paths map[string]os.FileInfo) []string { + files := make([]string, 0) + for file := range paths { + files = append(files, file) + } + return files +} + +func (w *fileWatcher) createEvent(path string, fi os.FileInfo) loginp.FSEvent { + return loginp.FSEvent{Op: loginp.OpCreate, OldPath: "", NewPath: path, Info: fi} +} + +func (w *fileWatcher) writeEvent(path string, fi os.FileInfo) loginp.FSEvent { + return loginp.FSEvent{Op: loginp.OpWrite, OldPath: path, NewPath: path, Info: fi} +} + +func (w *fileWatcher) renamedEvent(oldPath, path string, fi os.FileInfo) loginp.FSEvent { + return loginp.FSEvent{Op: loginp.OpRename, OldPath: oldPath, NewPath: path, Info: fi} +} + +func (w *fileWatcher) deleteEvent(path string, fi os.FileInfo) loginp.FSEvent { + return loginp.FSEvent{Op: loginp.OpDelete, OldPath: path, NewPath: "", Info: fi} +} + +func (w *fileWatcher) Event() loginp.FSEvent { + return <-w.events +} + +type fileScannerConfig struct { + Paths []string + ExcludedFiles []match.Matcher + Symlinks bool + RecursiveGlob bool +} + +func defaultFileScannerConfig() fileScannerConfig { + return fileScannerConfig{ + Symlinks: false, + RecursiveGlob: true, + } +} + +func newFileScanner(paths []string, cfg fileScannerConfig) (loginp.FSScanner, error) { + fs := fileScanner{ + paths: paths, + excludedFiles: cfg.ExcludedFiles, + symlinks: cfg.Symlinks, + log: logp.NewLogger(scannerName), + } + err := fs.resolveRecursiveGlobs(cfg) + if err != nil { + return nil, err + } + err = fs.normalizeGlobPatterns() + if err != nil { + return nil, err + } + + return &fs, nil +} + +// resolveRecursiveGlobs expands `**` from the globs in multiple patterns +func (s *fileScanner) resolveRecursiveGlobs(c fileScannerConfig) error { + if !c.RecursiveGlob { + s.log.Debug("recursive glob disabled") + return nil + } + + s.log.Debug("recursive glob enabled") + var paths []string + for _, path := range s.paths { + patterns, err := file.GlobPatterns(path, recursiveGlobDepth) + if err != nil { + return err + } + if len(patterns) > 1 { + s.log.Debugf("%q expanded to %#v", path, patterns) + } + paths = append(paths, patterns...) + } + s.paths = paths + return nil +} + +// normalizeGlobPatterns calls `filepath.Abs` on all the globs from config +func (s *fileScanner) normalizeGlobPatterns() error { + var paths []string + for _, path := range s.paths { + pathAbs, err := filepath.Abs(path) + if err != nil { + return fmt.Errorf("failed to get the absolute path for %s: %v", path, err) + } + paths = append(paths, pathAbs) + } + s.paths = paths + return nil +} + +// GetFiles returns a map of files and fileinfos which +// match the configured paths. +func (s *fileScanner) GetFiles() map[string]os.FileInfo { + paths := map[string]os.FileInfo{} + + for _, path := range s.paths { + matches, err := filepath.Glob(path) + if err != nil { + s.log.Errorf("glob(%s) failed: %v", path, err) + continue + } + + OUTER: + for _, file := range matches { + if s.skippedFile(file) { + continue + } + + // If symlink is enabled, it is checked that original is not part of same input + // If original is harvested by other input, states will potentially overwrite each other + if s.isOriginalAndSymlinkConfigured(file, paths) { + continue OUTER + } + + fileInfo, err := os.Stat(file) + if err != nil { + s.log.Debug("stat(%s) failed: %s", file, err) + continue + } + paths[file] = fileInfo + } + } + + return paths +} + +func (s *fileScanner) skippedFile(file string) bool { + if s.isFileExcluded(file) { + s.log.Debugf("Exclude file: %s", file) + return true + } + + fileInfo, err := os.Lstat(file) + if err != nil { + s.log.Debugf("lstat(%s) failed: %s", file, err) + return true + } + + if fileInfo.IsDir() { + s.log.Debugf("Skipping directory: %s", file) + return true + } + + isSymlink := fileInfo.Mode()&os.ModeSymlink > 0 + if isSymlink && !s.symlinks { + s.log.Debugf("File %s skipped as it is a symlink", file) + return true + } + + return false +} + +func (s *fileScanner) isOriginalAndSymlinkConfigured(file string, paths map[string]os.FileInfo) bool { + if s.symlinks { + fileInfo, err := os.Stat(file) + if err != nil { + s.log.Debugf("stat(%s) failed: %s", file, err) + return true + } + + for _, finfo := range paths { + if os.SameFile(finfo, fileInfo) { + s.log.Info("Same file found as symlink and original. Skipping file: %s (as it same as %s)", file, finfo.Name()) + return true + } + } + } + return false +} + +func (s *fileScanner) isFileExcluded(file string) bool { + return len(s.excludedFiles) > 0 && s.matchAny(s.excludedFiles, file) +} + +// matchAny checks if the text matches any of the regular expressions +func (s *fileScanner) matchAny(matchers []match.Matcher, text string) bool { + for _, m := range matchers { + if m.MatchString(text) { + return true + } + } + return false +} diff --git a/filebeat/input/filestream/fswatch_test.go b/filebeat/input/filestream/fswatch_test.go new file mode 100644 index 00000000000..5e63987c868 --- /dev/null +++ b/filebeat/input/filestream/fswatch_test.go @@ -0,0 +1,300 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package filestream + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + + loginp "github.com/elastic/beats/v7/filebeat/input/filestream/internal/input-logfile" + "github.com/elastic/beats/v7/libbeat/common/match" + "github.com/elastic/beats/v7/libbeat/logp" +) + +func TestFileScanner(t *testing.T) { + testCases := map[string]struct { + paths []string + excludedFiles []match.Matcher + symlinks bool + expectedFiles []string + }{ + "select all files": { + paths: []string{ + filepath.Join("testdata", "excluded_file"), + filepath.Join("testdata", "included_file"), + }, + expectedFiles: []string{ + mustAbsPath(filepath.Join("testdata", "excluded_file")), + mustAbsPath(filepath.Join("testdata", "included_file")), + }, + }, + "skip excluded files": { + paths: []string{ + filepath.Join("testdata", "excluded_file"), + filepath.Join("testdata", "included_file"), + }, + excludedFiles: []match.Matcher{ + match.MustCompile(filepath.Join("testdata", "excluded_file")), + }, + expectedFiles: []string{ + mustAbsPath(filepath.Join("testdata", "included_file")), + }, + }, + // covers test_input.py/test_skip_symlinks + "skip symlinks": { + paths: []string{ + filepath.Join("testdata", "symlink_to_included_file"), + filepath.Join("testdata", "included_file"), + }, + symlinks: false, + expectedFiles: []string{ + mustAbsPath(filepath.Join("testdata", "included_file")), + }, + }, + "return a file once if symlinks are enabled": { + paths: []string{ + filepath.Join("testdata", "symlink_to_included_file"), + filepath.Join("testdata", "included_file"), + }, + symlinks: true, + expectedFiles: []string{ + mustAbsPath(filepath.Join("testdata", "included_file")), + }, + }, + "skip directories": { + paths: []string{ + filepath.Join("testdata", "unharvestable_dir"), + }, + expectedFiles: []string{}, + }, + } + + for name, test := range testCases { + test := test + + t.Run(name, func(t *testing.T) { + cfg := fileScannerConfig{ + ExcludedFiles: test.excludedFiles, + Symlinks: test.symlinks, + RecursiveGlob: false, + } + fs, err := newFileScanner(test.paths, cfg) + if err != nil { + t.Fatal(err) + } + files := fs.GetFiles() + paths := make([]string, 0) + for p, _ := range files { + paths = append(paths, p) + } + assert.Equal(t, test.expectedFiles, paths) + }) + } +} + +func TestFileWatchNewDeleteModified(t *testing.T) { + oldTs := time.Now() + newTs := oldTs.Add(5 * time.Second) + testCases := map[string]struct { + prevFiles map[string]os.FileInfo + nextFiles map[string]os.FileInfo + expectedEvents []loginp.FSEvent + }{ + "one new file": { + prevFiles: map[string]os.FileInfo{}, + nextFiles: map[string]os.FileInfo{ + "new_path": testFileInfo{"new_path", 5, oldTs}, + }, + expectedEvents: []loginp.FSEvent{ + loginp.FSEvent{Op: loginp.OpCreate, OldPath: "", NewPath: "new_path", Info: testFileInfo{"new_path", 5, oldTs}}, + }, + }, + "one deleted file": { + prevFiles: map[string]os.FileInfo{ + "old_path": testFileInfo{"old_path", 5, oldTs}, + }, + nextFiles: map[string]os.FileInfo{}, + expectedEvents: []loginp.FSEvent{ + loginp.FSEvent{Op: loginp.OpDelete, OldPath: "old_path", NewPath: "", Info: testFileInfo{"old_path", 5, oldTs}}, + }, + }, + "one modified file": { + prevFiles: map[string]os.FileInfo{ + "path": testFileInfo{"path", 5, oldTs}, + }, + nextFiles: map[string]os.FileInfo{ + "path": testFileInfo{"path", 10, newTs}, + }, + expectedEvents: []loginp.FSEvent{ + loginp.FSEvent{Op: loginp.OpWrite, OldPath: "path", NewPath: "path", Info: testFileInfo{"path", 10, newTs}}, + }, + }, + "two modified files": { + prevFiles: map[string]os.FileInfo{ + "path1": testFileInfo{"path1", 5, oldTs}, + "path2": testFileInfo{"path2", 5, oldTs}, + }, + nextFiles: map[string]os.FileInfo{ + "path1": testFileInfo{"path1", 10, newTs}, + "path2": testFileInfo{"path2", 10, newTs}, + }, + expectedEvents: []loginp.FSEvent{ + loginp.FSEvent{Op: loginp.OpWrite, OldPath: "path1", NewPath: "path1", Info: testFileInfo{"path1", 10, newTs}}, + loginp.FSEvent{Op: loginp.OpWrite, OldPath: "path2", NewPath: "path2", Info: testFileInfo{"path2", 10, newTs}}, + }, + }, + "one modified file, one new file": { + prevFiles: map[string]os.FileInfo{ + "path1": testFileInfo{"path1", 5, oldTs}, + }, + nextFiles: map[string]os.FileInfo{ + "path1": testFileInfo{"path1", 10, newTs}, + "path2": testFileInfo{"path2", 10, newTs}, + }, + expectedEvents: []loginp.FSEvent{ + loginp.FSEvent{Op: loginp.OpWrite, OldPath: "path1", NewPath: "path1", Info: testFileInfo{"path1", 10, newTs}}, + loginp.FSEvent{Op: loginp.OpCreate, OldPath: "", NewPath: "path2", Info: testFileInfo{"path2", 10, newTs}}, + }, + }, + "one new file, one deleted file": { + prevFiles: map[string]os.FileInfo{ + "path_deleted": testFileInfo{"path_deleted", 5, oldTs}, + }, + nextFiles: map[string]os.FileInfo{ + "path_new": testFileInfo{"path_new", 10, newTs}, + }, + expectedEvents: []loginp.FSEvent{ + loginp.FSEvent{Op: loginp.OpDelete, OldPath: "path_deleted", NewPath: "", Info: testFileInfo{"path_deleted", 5, oldTs}}, + loginp.FSEvent{Op: loginp.OpCreate, OldPath: "", NewPath: "path_new", Info: testFileInfo{"path_new", 10, newTs}}, + }, + }, + } + + for name, test := range testCases { + test := test + + t.Run(name, func(t *testing.T) { + w := fileWatcher{ + log: logp.L(), + prev: test.prevFiles, + scanner: &mockScanner{test.nextFiles}, + events: make(chan loginp.FSEvent), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go w.watch(ctx) + + for _, expectedEvent := range test.expectedEvents { + evt := w.Event() + assert.Equal(t, expectedEvent, evt) + } + }) + } +} + +func TestFileWatcherRenamedFile(t *testing.T) { + testPath := mustAbsPath(filepath.Join("testdata", "first_name")) + renamedPath := mustAbsPath(filepath.Join("testdata", "renamed")) + + f, err := os.Create(testPath) + if err != nil { + t.Fatal(err) + } + f.Close() + fi, err := os.Stat(testPath) + if err != nil { + t.Fatal(err) + } + + cfg := fileScannerConfig{ + ExcludedFiles: nil, + Symlinks: false, + RecursiveGlob: false, + } + scanner, err := newFileScanner([]string{testPath, renamedPath}, cfg) + if err != nil { + t.Fatal(err) + } + w := fileWatcher{ + log: logp.L(), + scanner: scanner, + events: make(chan loginp.FSEvent), + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go w.watch(ctx) + assert.Equal(t, loginp.FSEvent{Op: loginp.OpCreate, OldPath: "", NewPath: testPath, Info: fi}, w.Event()) + + err = os.Rename(testPath, renamedPath) + if err != nil { + t.Fatal(err) + } + defer os.Remove(renamedPath) + fi, err = os.Stat(renamedPath) + if err != nil { + t.Fatal(err) + } + + go w.watch(ctx) + assert.Equal(t, loginp.FSEvent{Op: loginp.OpRename, OldPath: testPath, NewPath: renamedPath, Info: fi}, w.Event()) +} + +type mockScanner struct { + files map[string]os.FileInfo +} + +func (m *mockScanner) GetFiles() map[string]os.FileInfo { + return m.files +} + +type testFileInfo struct { + path string + size int64 + time time.Time +} + +func (t testFileInfo) Name() string { return t.path } +func (t testFileInfo) Size() int64 { return t.size } +func (t testFileInfo) Mode() os.FileMode { return 0 } +func (t testFileInfo) ModTime() time.Time { return t.time } +func (t testFileInfo) IsDir() bool { return false } +func (t testFileInfo) Sys() interface{} { return nil } + +func mustAbsPath(path string) string { + p, err := filepath.Abs(path) + if err != nil { + panic(err) + } + return p +} + +func mustDuration(durStr string) time.Duration { + dur, err := time.ParseDuration(durStr) + if err != nil { + panic(err) + } + return dur +} diff --git a/filebeat/input/filestream/testdata/excluded_file b/filebeat/input/filestream/testdata/excluded_file new file mode 100644 index 00000000000..e69de29bb2d diff --git a/filebeat/input/filestream/testdata/included_file b/filebeat/input/filestream/testdata/included_file new file mode 100644 index 00000000000..e69de29bb2d diff --git a/filebeat/input/filestream/testdata/symlink_to_included_file b/filebeat/input/filestream/testdata/symlink_to_included_file new file mode 120000 index 00000000000..40824f3f7d3 --- /dev/null +++ b/filebeat/input/filestream/testdata/symlink_to_included_file @@ -0,0 +1 @@ +filebeat/input/filestream/testdata/included_file \ No newline at end of file