ILIAS  release_5-4 Revision v5.4.26-12-gabc799a52e6
Delimiter.php
Go to the documentation of this file.
1 <?php
2 
4 
/**
 * Infers the field delimiter of a CSV stream.
 *
 * On construction, up to MAX_SAMPLE_LINES logical lines are read from the given
 * file handle and, for every candidate delimiter, the number of occurrences
 * outside of enclosures is recorded per line. infer() then picks the candidate
 * whose per-line count is the most consistent (smallest mean square deviation
 * from the median).
 */
class Delimiter
{
    /**
     * Candidate delimiters, in order of preference (used to break ties).
     * The (misspelled) name is kept because subclasses may reference it.
     */
    protected const POTENTIAL_DELIMETERS = [',', ';', "\t", '|', ':', ' ', '~'];

    /** Maximum number of logical lines sampled from the file. */
    private const MAX_SAMPLE_LINES = 1000;

    /** @var resource Handle the lines are read from. */
    protected $fileHandle;

    /** @var string Character that escapes an enclosure ('' for none). */
    protected $escapeCharacter;

    /** @var string Enclosure (quote) character ('' for none). */
    protected $enclosure;

    /** @var array<string, int[]> Per delimiter: occurrence count for each sampled line. */
    protected $counts = [];

    /** @var int Number of logical lines that have been sampled. */
    protected $numberLines = 0;

    /** @var null|string Delimiter chosen by the last successful infer() call. */
    protected $delimiter;

    /**
     * Stores the settings and immediately samples the file.
     *
     * @param resource $fileHandle      Readable handle, positioned where sampling should start
     * @param string   $escapeCharacter Escape character for enclosures ('' for none)
     * @param string   $enclosure       Enclosure character ('' for none)
     */
    public function __construct($fileHandle, string $escapeCharacter, string $enclosure)
    {
        $this->fileHandle = $fileHandle;
        $this->escapeCharacter = $escapeCharacter;
        $this->enclosure = $enclosure;

        $this->countPotentialDelimiters();
    }

    /**
     * Returns the first (most preferred) candidate delimiter.
     */
    public function getDefaultDelimiter(): string
    {
        return self::POTENTIAL_DELIMETERS[0];
    }

    /**
     * Returns the number of logical lines that were sampled.
     */
    public function linesCounted(): int
    {
        return $this->numberLines;
    }

    /**
     * Reads up to MAX_SAMPLE_LINES logical lines and records the delimiter
     * counts of each one.
     */
    protected function countPotentialDelimiters(): void
    {
        $this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []);
        $delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS);

        // The cap is checked BEFORE reading, and the counter is advanced only
        // after a line has been recorded, so numberLines always equals the
        // length of every series in $this->counts.
        $this->numberLines = 0;
        while (
            $this->numberLines < self::MAX_SAMPLE_LINES
            && ($line = $this->getNextLine()) !== false
        ) {
            $this->countDelimiterValues($line, $delimiterKeys);
            ++$this->numberLines;
        }
    }

    /**
     * Appends, for every candidate delimiter, its number of occurrences in $line.
     *
     * @param string             $line          Logical line with enclosed content already removed
     * @param array<string, int> $delimiterKeys Delimiters to count, as array keys;
     *                                          candidates missing from it are recorded as 0
     */
    protected function countDelimiterValues(string $line, array $delimiterKeys): void
    {
        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $this->counts[$delimiter][] = isset($delimiterKeys[$delimiter])
                ? substr_count($line, $delimiter)
                : 0;
        }
    }

    /**
     * Picks the delimiter whose per-line count is the most consistent.
     *
     * Candidates whose median count is 0 (or for which nothing was sampled) are
     * ignored. Ties are resolved by the order of POTENTIAL_DELIMETERS.
     *
     * @return null|string The inferred delimiter, or the previously stored one
     *                     (null initially) when no candidate qualifies
     */
    public function infer(): ?string
    {
        $min = INF;

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $series = $this->counts[$delimiter] ?? [];
            $size = count($series);
            if ($size === 0) {
                // Nothing sampled (e.g. empty file): no median can be computed
                continue;
            }

            sort($series);

            $middleIdx = intdiv($size - 1, 2);
            $median = ($size % 2)
                ? $series[$middleIdx]
                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

            // int / int yields an int when evenly divisible, so an all-zero
            // middle gives int 0 here in both the odd and the even case.
            if ($median === 0) {
                continue;
            }

            $meanSquareDeviation = array_reduce(
                $series,
                function ($sum, $value) use ($median) {
                    return $sum + ($value - $median) ** 2;
                },
                0
            ) / $size;

            // Strict "<" keeps the earlier candidate on ties
            if ($meanSquareDeviation < $min) {
                $min = $meanSquareDeviation;
                $this->delimiter = $delimiter;
            }
        }

        return $this->delimiter;
    }

    /**
     * Get the next full line from the file.
     *
     * Enclosed sections are removed from the result; while an unmatched
     * enclosure remains, further physical lines are appended so that an
     * enclosed value spanning several lines is treated as one logical line.
     *
     * @return false|string The line with enclosed content removed, or false
     *                      when the file is exhausted or the content could not
     *                      be processed
     */
    public function getNextLine()
    {
        // Without an enclosure there is nothing to strip. The patterns below
        // would degenerate to ones that always match, which would make the
        // loop consume the whole file and report no line at all.
        if ($this->enclosure === '') {
            return fgets($this->fileHandle);
        }

        $line = '';
        $enclosure = ($this->escapeCharacter === '' ? ''
                : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
            . preg_quote($this->enclosure, '/');

        do {
            // Get the next line in the file
            $newLine = fgets($this->fileHandle);

            // Return false if there is no next line
            if ($newLine === false) {
                return false;
            }

            // Drop everything that is enclosed to avoid counting false positives in enclosures
            $line = preg_replace('/(' . $enclosure . '.*' . $enclosure . ')/Us', '', $line . $newLine);

            // preg_replace() yields null on a PCRE error: give up on this line
            if ($line === null) {
                return false;
            }

            // If an enclosure is still left in the line, the enclosed value
            // continues on the next physical line, so read that one as well
        } while (preg_match('/(' . $enclosure . ')/', $line) > 0);

        return $line;
    }
}
__construct($fileHandle, string $escapeCharacter, string $enclosure)
Definition: Delimiter.php:30
countDelimiterValues(string $line, array $delimiterKeys)
Definition: Delimiter.php:61
getNextLine()
Get the next full line from the file.
Definition: Delimiter.php:123