Csv.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. <?php
  2. namespace PhpOffice\PhpSpreadsheet\Reader;
  3. use InvalidArgumentException;
  4. use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
  5. use PhpOffice\PhpSpreadsheet\Reader\Csv\Delimiter;
  6. use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
  7. use PhpOffice\PhpSpreadsheet\Spreadsheet;
  8. class Csv extends BaseReader
  9. {
  10. const DEFAULT_FALLBACK_ENCODING = 'CP1252';
  11. const GUESS_ENCODING = 'guess';
  12. const UTF8_BOM = "\xEF\xBB\xBF";
  13. const UTF8_BOM_LEN = 3;
  14. const UTF16BE_BOM = "\xfe\xff";
  15. const UTF16BE_BOM_LEN = 2;
  16. const UTF16BE_LF = "\x00\x0a";
  17. const UTF16LE_BOM = "\xff\xfe";
  18. const UTF16LE_BOM_LEN = 2;
  19. const UTF16LE_LF = "\x0a\x00";
  20. const UTF32BE_BOM = "\x00\x00\xfe\xff";
  21. const UTF32BE_BOM_LEN = 4;
  22. const UTF32BE_LF = "\x00\x00\x00\x0a";
  23. const UTF32LE_BOM = "\xff\xfe\x00\x00";
  24. const UTF32LE_BOM_LEN = 4;
  25. const UTF32LE_LF = "\x0a\x00\x00\x00";
  26. /**
  27. * Input encoding.
  28. *
  29. * @var string
  30. */
  31. private $inputEncoding = 'UTF-8';
  32. /**
  33. * Fallback encoding if 'guess' strikes out.
  34. *
  35. * @var string
  36. */
  37. private $fallbackEncoding = self::DEFAULT_FALLBACK_ENCODING;
  38. /**
  39. * Delimiter.
  40. *
  41. * @var ?string
  42. */
  43. private $delimiter;
  44. /**
  45. * Enclosure.
  46. *
  47. * @var string
  48. */
  49. private $enclosure = '"';
  50. /**
  51. * Sheet index to read.
  52. *
  53. * @var int
  54. */
  55. private $sheetIndex = 0;
  56. /**
  57. * Load rows contiguously.
  58. *
  59. * @var bool
  60. */
  61. private $contiguous = false;
  62. /**
  63. * The character that can escape the enclosure.
  64. *
  65. * @var string
  66. */
  67. private $escapeCharacter = '\\';
  68. /**
  69. * Callback for setting defaults in construction.
  70. *
  71. * @var ?callable
  72. */
  73. private static $constructorCallback;
  74. /**
  75. * Create a new CSV Reader instance.
  76. */
  77. public function __construct()
  78. {
  79. parent::__construct();
  80. $callback = self::$constructorCallback;
  81. if ($callback !== null) {
  82. $callback($this);
  83. }
  84. }
  85. /**
  86. * Set a callback to change the defaults.
  87. *
  88. * The callback must accept the Csv Reader object as the first parameter,
  89. * and it should return void.
  90. */
  91. public static function setConstructorCallback(?callable $callback): void
  92. {
  93. self::$constructorCallback = $callback;
  94. }
  95. public static function getConstructorCallback(): ?callable
  96. {
  97. return self::$constructorCallback;
  98. }
  99. public function setInputEncoding(string $pValue): self
  100. {
  101. $this->inputEncoding = $pValue;
  102. return $this;
  103. }
  104. public function getInputEncoding(): string
  105. {
  106. return $this->inputEncoding;
  107. }
  108. public function setFallbackEncoding(string $pValue): self
  109. {
  110. $this->fallbackEncoding = $pValue;
  111. return $this;
  112. }
  113. public function getFallbackEncoding(): string
  114. {
  115. return $this->fallbackEncoding;
  116. }
  117. /**
  118. * Move filepointer past any BOM marker.
  119. */
  120. protected function skipBOM(): void
  121. {
  122. rewind($this->fileHandle);
  123. if (fgets($this->fileHandle, self::UTF8_BOM_LEN + 1) !== self::UTF8_BOM) {
  124. rewind($this->fileHandle);
  125. }
  126. }
  127. /**
  128. * Identify any separator that is explicitly set in the file.
  129. */
  130. protected function checkSeparator(): void
  131. {
  132. $line = fgets($this->fileHandle);
  133. if ($line === false) {
  134. return;
  135. }
  136. if ((strlen(trim($line, "\r\n")) == 5) && (stripos($line, 'sep=') === 0)) {
  137. $this->delimiter = substr($line, 4, 1);
  138. return;
  139. }
  140. $this->skipBOM();
  141. }
  142. /**
  143. * Infer the separator if it isn't explicitly set in the file or specified by the user.
  144. */
  145. protected function inferSeparator(): void
  146. {
  147. if ($this->delimiter !== null) {
  148. return;
  149. }
  150. $inferenceEngine = new Delimiter($this->fileHandle, $this->escapeCharacter, $this->enclosure);
  151. // If number of lines is 0, nothing to infer : fall back to the default
  152. if ($inferenceEngine->linesCounted() === 0) {
  153. $this->delimiter = $inferenceEngine->getDefaultDelimiter();
  154. $this->skipBOM();
  155. return;
  156. }
  157. $this->delimiter = $inferenceEngine->infer();
  158. // If no delimiter could be detected, fall back to the default
  159. if ($this->delimiter === null) {
  160. $this->delimiter = $inferenceEngine->getDefaultDelimiter();
  161. }
  162. $this->skipBOM();
  163. }
  164. /**
  165. * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
  166. */
  167. public function listWorksheetInfo(string $pFilename): array
  168. {
  169. // Open file
  170. $this->openFileOrMemory($pFilename);
  171. $fileHandle = $this->fileHandle;
  172. // Skip BOM, if any
  173. $this->skipBOM();
  174. $this->checkSeparator();
  175. $this->inferSeparator();
  176. $worksheetInfo = [];
  177. $worksheetInfo[0]['worksheetName'] = 'Worksheet';
  178. $worksheetInfo[0]['lastColumnLetter'] = 'A';
  179. $worksheetInfo[0]['lastColumnIndex'] = 0;
  180. $worksheetInfo[0]['totalRows'] = 0;
  181. $worksheetInfo[0]['totalColumns'] = 0;
  182. // Loop through each line of the file in turn
  183. $rowData = fgetcsv($fileHandle, 0, $this->delimiter ?? '', $this->enclosure, $this->escapeCharacter);
  184. while (is_array($rowData)) {
  185. ++$worksheetInfo[0]['totalRows'];
  186. $worksheetInfo[0]['lastColumnIndex'] = max($worksheetInfo[0]['lastColumnIndex'], count($rowData) - 1);
  187. $rowData = fgetcsv($fileHandle, 0, $this->delimiter ?? '', $this->enclosure, $this->escapeCharacter);
  188. }
  189. $worksheetInfo[0]['lastColumnLetter'] = Coordinate::stringFromColumnIndex($worksheetInfo[0]['lastColumnIndex'] + 1);
  190. $worksheetInfo[0]['totalColumns'] = $worksheetInfo[0]['lastColumnIndex'] + 1;
  191. // Close file
  192. fclose($fileHandle);
  193. return $worksheetInfo;
  194. }
  195. /**
  196. * Loads Spreadsheet from file.
  197. *
  198. * @param string $pFilename
  199. *
  200. * @return Spreadsheet
  201. */
  202. public function load($pFilename)
  203. {
  204. // Create new Spreadsheet
  205. $spreadsheet = new Spreadsheet();
  206. // Load into this instance
  207. return $this->loadIntoExisting($pFilename, $spreadsheet);
  208. }
  209. private function openFileOrMemory(string $pFilename): void
  210. {
  211. // Open file
  212. $fhandle = $this->canRead($pFilename);
  213. if (!$fhandle) {
  214. throw new Exception($pFilename . ' is an Invalid Spreadsheet file.');
  215. }
  216. if ($this->inputEncoding === self::GUESS_ENCODING) {
  217. $this->inputEncoding = self::guessEncoding($pFilename, $this->fallbackEncoding);
  218. }
  219. $this->openFile($pFilename);
  220. if ($this->inputEncoding !== 'UTF-8') {
  221. fclose($this->fileHandle);
  222. $entireFile = file_get_contents($pFilename);
  223. $this->fileHandle = fopen('php://memory', 'r+b');
  224. if ($this->fileHandle !== false && $entireFile !== false) {
  225. $data = StringHelper::convertEncoding($entireFile, 'UTF-8', $this->inputEncoding);
  226. fwrite($this->fileHandle, $data);
  227. $this->skipBOM();
  228. }
  229. }
  230. }
  231. /**
  232. * Loads PhpSpreadsheet from file into PhpSpreadsheet instance.
  233. */
  234. public function loadIntoExisting(string $pFilename, Spreadsheet $spreadsheet): Spreadsheet
  235. {
  236. $lineEnding = ini_get('auto_detect_line_endings') ?: '0';
  237. ini_set('auto_detect_line_endings', '1');
  238. // Open file
  239. $this->openFileOrMemory($pFilename);
  240. $fileHandle = $this->fileHandle;
  241. // Skip BOM, if any
  242. $this->skipBOM();
  243. $this->checkSeparator();
  244. $this->inferSeparator();
  245. // Create new PhpSpreadsheet object
  246. while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
  247. $spreadsheet->createSheet();
  248. }
  249. $sheet = $spreadsheet->setActiveSheetIndex($this->sheetIndex);
  250. // Set our starting row based on whether we're in contiguous mode or not
  251. $currentRow = 1;
  252. $outRow = 0;
  253. // Loop through each line of the file in turn
  254. $rowData = fgetcsv($fileHandle, 0, $this->delimiter ?? '', $this->enclosure, $this->escapeCharacter);
  255. while (is_array($rowData)) {
  256. $noOutputYet = true;
  257. $columnLetter = 'A';
  258. foreach ($rowData as $rowDatum) {
  259. if ($rowDatum != '' && $this->readFilter->readCell($columnLetter, $currentRow)) {
  260. if ($this->contiguous) {
  261. if ($noOutputYet) {
  262. $noOutputYet = false;
  263. ++$outRow;
  264. }
  265. } else {
  266. $outRow = $currentRow;
  267. }
  268. // Set cell value
  269. $sheet->getCell($columnLetter . $outRow)->setValue($rowDatum);
  270. }
  271. ++$columnLetter;
  272. }
  273. $rowData = fgetcsv($fileHandle, 0, $this->delimiter ?? '', $this->enclosure, $this->escapeCharacter);
  274. ++$currentRow;
  275. }
  276. // Close file
  277. fclose($fileHandle);
  278. ini_set('auto_detect_line_endings', $lineEnding);
  279. // Return
  280. return $spreadsheet;
  281. }
  282. public function getDelimiter(): ?string
  283. {
  284. return $this->delimiter;
  285. }
  286. public function setDelimiter(string $delimiter): self
  287. {
  288. $this->delimiter = $delimiter;
  289. return $this;
  290. }
  291. public function getEnclosure(): string
  292. {
  293. return $this->enclosure;
  294. }
  295. public function setEnclosure(string $enclosure): self
  296. {
  297. if ($enclosure == '') {
  298. $enclosure = '"';
  299. }
  300. $this->enclosure = $enclosure;
  301. return $this;
  302. }
  303. public function getSheetIndex(): int
  304. {
  305. return $this->sheetIndex;
  306. }
  307. public function setSheetIndex(int $pValue): self
  308. {
  309. $this->sheetIndex = $pValue;
  310. return $this;
  311. }
  312. public function setContiguous(bool $contiguous): self
  313. {
  314. $this->contiguous = (bool) $contiguous;
  315. return $this;
  316. }
  317. public function getContiguous(): bool
  318. {
  319. return $this->contiguous;
  320. }
  321. public function setEscapeCharacter(string $escapeCharacter): self
  322. {
  323. $this->escapeCharacter = $escapeCharacter;
  324. return $this;
  325. }
  326. public function getEscapeCharacter(): string
  327. {
  328. return $this->escapeCharacter;
  329. }
  330. /**
  331. * Scrutinizer believes, incorrectly, that the specific pathinfo
  332. * call in canRead can return something other than an array.
  333. * Phpstan knows better.
  334. * This function satisfies both.
  335. *
  336. * @param mixed $extension
  337. */
  338. private static function extractStringLower($extension): string
  339. {
  340. return is_string($extension) ? strtolower($extension) : '';
  341. }
  342. /**
  343. * Can the current IReader read the file?
  344. *
  345. * @param string $pFilename
  346. *
  347. * @return bool
  348. */
  349. public function canRead($pFilename)
  350. {
  351. // Check if file exists
  352. try {
  353. $this->openFile($pFilename);
  354. } catch (InvalidArgumentException $e) {
  355. return false;
  356. }
  357. fclose($this->fileHandle);
  358. // Trust file extension if any
  359. $extension = self::extractStringLower(pathinfo($pFilename, PATHINFO_EXTENSION));
  360. if (in_array($extension, ['csv', 'tsv'])) {
  361. return true;
  362. }
  363. // Attempt to guess mimetype
  364. $type = mime_content_type($pFilename);
  365. $supportedTypes = [
  366. 'application/csv',
  367. 'text/csv',
  368. 'text/plain',
  369. 'inode/x-empty',
  370. ];
  371. return in_array($type, $supportedTypes, true);
  372. }
  373. private static function guessEncodingTestNoBom(string &$encoding, string &$contents, string $compare, string $setEncoding): void
  374. {
  375. if ($encoding === '') {
  376. $pos = strpos($contents, $compare);
  377. if ($pos !== false && $pos % strlen($compare) === 0) {
  378. $encoding = $setEncoding;
  379. }
  380. }
  381. }
  382. private static function guessEncodingNoBom(string $filename): string
  383. {
  384. $encoding = '';
  385. $contents = file_get_contents($filename);
  386. self::guessEncodingTestNoBom($encoding, $contents, self::UTF32BE_LF, 'UTF-32BE');
  387. self::guessEncodingTestNoBom($encoding, $contents, self::UTF32LE_LF, 'UTF-32LE');
  388. self::guessEncodingTestNoBom($encoding, $contents, self::UTF16BE_LF, 'UTF-16BE');
  389. self::guessEncodingTestNoBom($encoding, $contents, self::UTF16LE_LF, 'UTF-16LE');
  390. if ($encoding === '' && preg_match('//u', $contents) === 1) {
  391. $encoding = 'UTF-8';
  392. }
  393. return $encoding;
  394. }
  395. private static function guessEncodingTestBom(string &$encoding, string $first4, string $compare, string $setEncoding): void
  396. {
  397. if ($encoding === '') {
  398. if ($compare === substr($first4, 0, strlen($compare))) {
  399. $encoding = $setEncoding;
  400. }
  401. }
  402. }
  403. private static function guessEncodingBom(string $filename): string
  404. {
  405. $encoding = '';
  406. $first4 = file_get_contents($filename, false, null, 0, 4);
  407. if ($first4 !== false) {
  408. self::guessEncodingTestBom($encoding, $first4, self::UTF8_BOM, 'UTF-8');
  409. self::guessEncodingTestBom($encoding, $first4, self::UTF16BE_BOM, 'UTF-16BE');
  410. self::guessEncodingTestBom($encoding, $first4, self::UTF32BE_BOM, 'UTF-32BE');
  411. self::guessEncodingTestBom($encoding, $first4, self::UTF32LE_BOM, 'UTF-32LE');
  412. self::guessEncodingTestBom($encoding, $first4, self::UTF16LE_BOM, 'UTF-16LE');
  413. }
  414. return $encoding;
  415. }
  416. public static function guessEncoding(string $filename, string $dflt = self::DEFAULT_FALLBACK_ENCODING): string
  417. {
  418. $encoding = self::guessEncodingBom($filename);
  419. if ($encoding === '') {
  420. $encoding = self::guessEncodingNoBom($filename);
  421. }
  422. return ($encoding === '') ? $dflt : $encoding;
  423. }
  424. }