// mangadex-home-rs/src/cache/fs.rs

use actix_web::error::PayloadError;
use bytes::{Buf, Bytes, BytesMut};
use futures::{Future, Stream, StreamExt};
use log::debug;
use once_cell::sync::Lazy;
use serde::Deserialize;
use std::collections::HashMap;
use std::fmt::Display;
use std::io::SeekFrom;
use std::num::NonZeroU32;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::task::{Context, Poll};
use tokio::fs::{create_dir_all, remove_file, File};
use tokio::io::{AsyncRead, AsyncSeekExt, AsyncWriteExt, BufReader, ReadBuf};
use tokio::sync::mpsc::Sender;
use tokio::sync::watch::{channel, Receiver};
use tokio::sync::RwLock;
use tokio_stream::wrappers::WatchStream;
use tokio_util::codec::{BytesCodec, FramedRead};

use super::{BoxedImageStream, CacheKey, CacheStream, CacheStreamItem, ImageMetadata};

/// Keeps track of files that are currently being written to.
///
/// Why is this necessary? Consider the following situation:
///
/// Client A requests file `foo.png`. We construct a transparent file stream,
/// and the file is now being streamed into and read from.
///
/// Client B requests the same file `foo.png`. A naive implementation would
/// attempt to read the file directly, since it sees that the file exists.
/// This is problematic, as the file could still be written to. If Client B
/// catches up to Client A's request, then Client B could receive a broken
/// image, as it thinks it's done reading the file.
///
/// We effectively use `WRITING_STATUS` as a status relay to ensure that
/// concurrent reads of a file that is still being written to will wait until
/// writing is completed.
static WRITING_STATUS: Lazy<RwLock<HashMap<PathBuf, Receiver<WritingStatus>>>> =
    Lazy::new(|| RwLock::new(HashMap::new()));
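
// A minimal sketch (illustrative only, not used by this module) of the relay
// pattern `WRITING_STATUS` enables, built on the same `tokio::sync::watch`
// primitives that `write_file` below uses; the `1024` is a made-up byte count:
//
//     let (tx, rx) = channel(WritingStatus::NotDone);
//     let _ = tx.send(WritingStatus::NotDone);    // writer: after each chunk
//     let _ = tx.send(WritingStatus::Done(1024)); // writer: on completion
//     let mut statuses = WatchStream::new(rx);    // reader: yields the latest status
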
/// Tries to read from the file, returning a byte stream if it exists
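///
/// Returns `None` if the file can't be opened or its metadata header fails to
/// parse.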
pub async fn read_file(
    path: &Path,
) -> Option<Result<(CacheStream, ImageMetadata), std::io::Error>> {
    let std_file = std::fs::File::open(path).ok()?;
    let file = File::from_std(std_file.try_clone().ok()?);

    let metadata = {
        let mut de = serde_json::Deserializer::from_reader(std_file);
        ImageMetadata::deserialize(&mut de).ok()?
    };
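
    // Note: `try_clone` above duplicates the OS handle, so `file` shares its
    // cursor with `std_file`; after the metadata header is deserialized, the
    // tokio `File` is already positioned at the start of the image bytes.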

    // False positive: `file` is used in both branches, which means that it's
    // not possible to move this into a `map_or_else` without cloning `file`.
    #[allow(clippy::option_if_let_else)]
    let stream = if let Some(status) = WRITING_STATUS.read().await.get(path).map(Clone::clone) {
        CacheStream::Concurrent(ConcurrentFsStream::from_file(
            file,
            WatchStream::new(status),
        ))
    } else {
        CacheStream::Completed(FramedRead::new(BufReader::new(file), BytesCodec::new()))
    };

    Some(Ok((stream, metadata)))
}

/// Writes the given byte stream to disk, returning a stream that reads back
/// from the file on disk as it is being written.
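///
/// The on-disk format is the serialized JSON metadata immediately followed by
/// the raw image bytes; `read_file` depends on this layout.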
pub async fn write_file<
    Fut: 'static + Send + Sync + Future<Output = ()>,
    DbCallback: 'static + Send + Sync + FnOnce(u32) -> Fut,
>(
    path: &Path,
    cache_key: CacheKey,
    mut byte_stream: BoxedImageStream,
    metadata: ImageMetadata,
    db_callback: DbCallback,
    on_complete: Option<Sender<(CacheKey, Bytes, ImageMetadata, usize)>>,
) -> Result<CacheStream, std::io::Error> {
    let (tx, rx) = channel(WritingStatus::NotDone);
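
    // A `watch` channel only retains the most recent value, which is exactly
    // what readers need here: they only ever act on the latest writer status.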

    let mut file = {
        let mut write_lock = WRITING_STATUS.write().await;
        let parent = path.parent().unwrap();
        create_dir_all(parent).await?;
        // We need to make sure the file exists and is truncated.
        let file = File::create(path).await?;
        write_lock.insert(path.to_path_buf(), rx.clone());
        file
    };

    let metadata_string = serde_json::to_string(&metadata).unwrap();
    let metadata_size = metadata_string.len();
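
    // `metadata_size` is the byte offset at which the image data begins; the
    // stream returned below seeks past the metadata header to this offset.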

    // We need an owned copy because of the async block's lifetime.
    let path_buf = path.to_path_buf();
    tokio::spawn(async move {
        let path_buf = path_buf; // moves `path_buf` into the async block
        let mut errored = false;
        let mut bytes_written: u32 = 0;
        let mut acc_bytes = BytesMut::new();
        let accumulate = on_complete.is_some();
        file.write_all(metadata_string.as_bytes()).await?;

        while let Some(bytes) = byte_stream.next().await {
            if let Ok(mut bytes) = bytes {
                if accumulate {
                    acc_bytes.extend(&bytes);
                }

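                // `write` may accept only part of the buffer, so we loop,
                // advancing past whatever was written, until the remaining
                // buffer is empty (a write of 0 bytes).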
                loop {
                    match file.write(&bytes).await? {
                        0 => break,
                        n => {
                            bytes.advance(n);
                            bytes_written += n as u32;

                            // We don't care if we don't have receivers
                            let _ = tx.send(WritingStatus::NotDone);
                        }
                    }
                }
            } else {
                errored = true;
                break;
            }
        }

        if errored {
            // It's okay if deleting the file fails, since we truncate on
            // create anyway, but it should be best effort.
            let _ = remove_file(&path_buf).await;
        } else {
            file.flush().await?;
            file.sync_all().await?; // we need the file metadata flushed too
            debug!("writing to file done");
        }

        {
            let mut write_lock = WRITING_STATUS.write().await;
            // This needs to be written atomically with the write lock held,
            // or else we could end up in an inconsistent state.
            //
            // We don't really care if we have no receivers.
            if errored {
                let _ = tx.send(WritingStatus::Error);
            } else {
                let _ = tx.send(WritingStatus::Done(bytes_written));
            }
            write_lock.remove(&path_buf);
        }

        tokio::spawn(db_callback(bytes_written));

        if accumulate {
            tokio::spawn(async move {
                let sender = on_complete.unwrap();
                sender
                    .send((
                        cache_key,
                        acc_bytes.freeze(),
                        metadata,
                        bytes_written as usize,
                    ))
                    .await
            });
        }

        // We don't ever check this, so the return value doesn't matter.
        Ok::<_, std::io::Error>(())
    });

    Ok(CacheStream::Concurrent(
        ConcurrentFsStream::new(path, metadata_size, WatchStream::new(rx)).await?,
    ))
}
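
/// A byte stream over a file that may still be in the process of being
/// written, gated on status updates relayed from the writer.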
pub struct ConcurrentFsStream {
    file: Pin<Box<BufReader<File>>>,
    receiver: Pin<Box<WatchStream<WritingStatus>>>,
    bytes_read: u32,
    bytes_total: Option<NonZeroU32>,
}

impl ConcurrentFsStream {
    async fn new(
        path: &Path,
        seek: usize,
        receiver: WatchStream<WritingStatus>,
    ) -> Result<Self, std::io::Error> {
        let mut file = File::open(path).await?;
        file.seek(SeekFrom::Start(seek as u64)).await?;
        Ok(Self::from_file(file, receiver))
    }

    fn from_file(file: File, receiver: WatchStream<WritingStatus>) -> Self {
        Self {
            file: Box::pin(BufReader::new(file)),
            receiver: Box::pin(receiver),
            bytes_read: 0,
            bytes_total: None,
        }
    }
}

/// Represents some upstream error.
#[derive(Debug)]
pub struct UpstreamError;

impl std::error::Error for UpstreamError {}

impl Display for UpstreamError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "An upstream error occurred")
    }
}

impl Stream for ConcurrentFsStream {
    type Item = CacheStreamItem;

    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        // First, try to read from the file...
        // TODO: Might be more efficient to have a larger buffer.
        let mut bytes = vec![0; 4 * 1024];
        let mut buffer = ReadBuf::new(&mut bytes);
        match self.file.as_mut().poll_read(cx, &mut buffer) {
            Poll::Ready(Ok(_)) => (),
            Poll::Ready(Err(_)) => return Poll::Ready(Some(Err(UpstreamError))),
            Poll::Pending => return Poll::Pending,
        }

        // At this point, we know that we "successfully" read some amount of
        // data. Let's see if there's actual data in there...
        let filled = buffer.filled().len();
        if filled == 0 {
            // Filled is zero, which indicates one of two situations:
            // 1. We are actually done.
            // 2. We read to the EOF while the writer is still writing to it.
            //
            // To handle the second case, we need to check the status of the
            // writer and see if it's done writing yet.
            if let Poll::Ready(Some(WritingStatus::Done(n))) =
                self.receiver.as_mut().poll_next_unpin(cx)
            {
                self.bytes_total = Some(NonZeroU32::new(n).unwrap())
            }

            // Okay, now we know if we've read enough bytes or not. If the
            // writer hasn't told us that it's done yet, then we know that
            // there must be more bytes to read.
            if let Some(bytes_total) = self.bytes_total {
                if bytes_total.get() == self.bytes_read {
                    // We matched the number of bytes the writer said it
                    // wrote, so we're finally done.
                    return Poll::Ready(None);
                }
            }

            // We haven't read enough bytes, so just return an empty `Bytes`
            // and have the executor request some bytes again in the future.
            //
            // This case might be solved by io_uring, but for now this is the
            // best we can do.
            Poll::Ready(Some(Ok(Bytes::new())))
        } else {
            // We have data! Give it to the reader!
            self.bytes_read += filled as u32;
            bytes.truncate(filled);
            Poll::Ready(Some(Ok(bytes.into())))
        }
    }
}

impl From<UpstreamError> for actix_web::Error {
    #[inline]
    fn from(_: UpstreamError) -> Self {
        PayloadError::Incomplete(None).into()
    }
}

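/// The status of a file write in progress, relayed from the writer in
/// `write_file` to any concurrent readers. `Done` carries the total number of
/// image bytes written, excluding the metadata header.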
#[derive(Debug, Clone, Copy)]
enum WritingStatus {
    NotDone,
    Done(u32),
    Error,
}