ILIAS  release_5-4 Revision v5.4.26-12-gabc799a52e6
Delimiter.php
Go to the documentation of this file.
1<?php
2
4
6{
/** Candidate delimiters. The first entry is the default, and earlier entries win ties in infer(). */
7 protected const POTENTIAL_DELIMETERS = [',', ';', "\t", '|', ':', ' ', '~'];
8
/** Handle that getNextLine() passes to fgets() to read the lines. */
10 protected $fileHandle;
11
// NOTE(review): $escapeCharacter is assigned in the constructor, but no declaration for it is
// visible in this listing (the embedded numbering skips 12-13 here) - confirm it is declared.
14
/** Enclosure character; getNextLine() strips text found between two enclosures. */
16 protected $enclosure;
17
/** Map of candidate delimiter => list of occurrence counts, one entry per tallied line. */
19 protected $counts = [];
20
/** Line counter maintained by countPotentialDelimiters() and reported by linesCounted(). */
22 protected $numberLines = 0;
23
/** Delimiter selected by infer(); not assigned anywhere else in the visible code. */
25 protected $delimiter;
26
/**
 * Stores the handle and the quoting settings that getNextLine() works with.
 *
 * @param mixed  $fileHandle      handle that getNextLine() passes to fgets()
 * @param string $escapeCharacter when non-empty, an enclosure character directly preceded by
 *                                this string is not treated as an enclosure by getNextLine()
 * @param string $enclosure       enclosure character; enclosed text is stripped before counting
 */
30 public function __construct($fileHandle, string $escapeCharacter, string $enclosure)
31 {
32 $this->fileHandle = $fileHandle;
33 $this->escapeCharacter = $escapeCharacter;
34 $this->enclosure = $enclosure;
35
// NOTE(review): the embedded listing numbers jump from 35 to 37, so one line of the original
// body appears to be missing from this view. Nothing visible calls countPotentialDelimiters();
// presumably that call belongs here - confirm against the real file.
37 }
38
/**
 * Returns the fallback delimiter, i.e. the first entry of the candidate list.
 *
 * @return string the first element of POTENTIAL_DELIMETERS
 */
public function getDefaultDelimiter(): string
{
    $candidates = self::POTENTIAL_DELIMETERS;

    return $candidates[0];
}
43
/**
 * Reports the current value of the line counter.
 *
 * @return int the value held in $this->numberLines
 */
public function linesCounted(): int
{
    $lineTotal = $this->numberLines;

    return $lineTotal;
}
48
/**
 * Samples the file and records, per candidate delimiter, how often it occurs on each line.
 *
 * Resets $this->counts and $this->numberLines, then pulls lines from getNextLine()
 * until it reports false or 1000 lines have been sampled. Every line that is read
 * is also tallied, so numberLines always equals the number of samples stored for
 * each delimiter.
 */
protected function countPotentialDelimiters(): void
{
    $this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []);
    $delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS);

    // Count how many times each of the potential delimiters appears in each line.
    // The limit is tested BEFORE reading: testing it after the read would consume
    // the line that hits the limit without tallying it, leaving numberLines one
    // higher than the number of samples per delimiter.
    $this->numberLines = 0;
    while ($this->numberLines < 1000 && ($line = $this->getNextLine()) !== false) {
        $this->countDelimiterValues($line, $delimiterKeys);
        ++$this->numberLines;
    }
}
60
/**
 * Appends one sample per candidate delimiter for the given line.
 *
 * For each entry of POTENTIAL_DELIMETERS the number of occurrences in $line is
 * pushed onto $this->counts[<delimiter>]. A candidate that is not a key of
 * $delimiterKeys is recorded as 0 regardless of the line's content.
 *
 * @param string $line          text to examine
 * @param array  $delimiterKeys array whose keys are the delimiters to be counted
 */
protected function countDelimiterValues(string $line, array $delimiterKeys): void
{
    foreach (self::POTENTIAL_DELIMETERS as $candidate) {
        // Every candidate is a single character, so a substring count equals a character count
        $occurrences = array_key_exists($candidate, $delimiterKeys)
            ? substr_count($line, $candidate)
            : 0;

        $this->counts[$candidate][] = $occurrences;
    }
}
73
/**
 * Picks the candidate delimiter whose per-line occurrence count is the most consistent.
 *
 * For every candidate the sampled counts are sorted and their median is taken;
 * candidates with a median of 0 (or with no samples at all) are ignored. The
 * remaining candidates are scored by the mean squared deviation of their counts
 * from that median and the lowest score wins. Ties go to the candidate listed
 * first in POTENTIAL_DELIMETERS.
 *
 * @return string|null the chosen delimiter, which is also stored in $this->delimiter;
 *                     when no candidate qualifies, $this->delimiter is returned unchanged
 */
public function infer(): ?string
{
    // Calculate the mean square deviations for each delimiter
    // (ignoring delimiters that haven't been found consistently)
    $meanSquareDeviations = [];

    foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
        $series = $this->counts[$delimiter] ?? [];
        $sampleSize = count($series);
        if ($sampleSize === 0) {
            // Nothing was sampled, so there is no median to compute
            continue;
        }

        sort($series);

        // Index by the real number of samples rather than by numberLines, so the
        // median stays correct (and in bounds) even if the two ever differ
        $middleIdx = intdiv($sampleSize - 1, 2);
        $median = ($sampleSize % 2)
            ? $series[$middleIdx]
            : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

        // The counts are integers, so a zero median is always the integer 0
        if ($median === 0) {
            continue;
        }

        $meanSquareDeviations[$delimiter] = array_reduce(
            $series,
            static function ($sum, $value) use ($median) {
                return $sum + ($value - $median) ** 2;
            },
            0
        ) / $sampleSize;
    }

    // ... and pick the delimiter with the smallest mean square deviation.
    // The map was filled in POTENTIAL_DELIMETERS order and the comparison is
    // strict, so in case of ties the earlier candidate is kept.
    $min = INF;
    foreach ($meanSquareDeviations as $delimiter => $deviation) {
        if ($deviation < $min) {
            $min = $deviation;
            // Array keys that look numeric come back as integers; keep the result a string
            $this->delimiter = (string) $delimiter;
        }
    }

    return $this->delimiter;
}
117
/**
 * Get the next full line from the file.
 *
 * A logical line may span several physical lines when an enclosure is opened on
 * one line and closed on a later one; physical lines are appended until no
 * unmatched enclosure remains. Enclosed sections are removed from the returned
 * text so that delimiters inside them are not counted.
 *
 * @return false|string the line with enclosed sections stripped, or false when fgets()
 *                      reports no further line (including while an enclosure is still
 *                      open) or when the enclosure-stripping regex fails
 */
public function getNextLine()
{
    // Without an enclosure character there is nothing to strip. The "open enclosure"
    // pattern built below would then match every string, so the loop would read to
    // the end of the file and always report false.
    if ($this->enclosure === '') {
        return fgets($this->fileHandle);
    }

    // An enclosure character only counts when it is not preceded by the escape character
    $enclosure = ($this->escapeCharacter === '' ? ''
        : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
        . preg_quote($this->enclosure, '/');

    // Both patterns are loop-invariant, so build them once
    $enclosedSection = '/(' . $enclosure . '.*' . $enclosure . ')/Us';
    $openEnclosure = '/(' . $enclosure . ')/';

    $line = '';
    do {
        // Get the next line in the file
        $newLine = fgets($this->fileHandle);

        // Return false if there is no next line
        if ($newLine === false) {
            return false;
        }

        // Append the new line, then drop everything that is enclosed
        // to avoid counting false positives in enclosures
        $line = preg_replace($enclosedSection, '', $line . $newLine);

        // preg_replace() yields null on a PCRE error; report that as "no line"
        if ($line === null) {
            return false;
        }

        // If an enclosure is still left in the line it has not been closed yet,
        // so the next physical line has to be read as well
    } while (preg_match($openEnclosure, $line) > 0);

    return $line;
}
151}
An exception for terminating execution or to throw for unit testing.
getNextLine()
Get the next full line from the file.
Definition: Delimiter.php:123
countDelimiterValues(string $line, array $delimiterKeys)
Definition: Delimiter.php:61
__construct($fileHandle, string $escapeCharacter, string $enclosure)
Definition: Delimiter.php:30